src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
   5      Free Software Foundation, Inc.
   6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include "lisp.h"
  38 #include "character.h"
  39 #include "buffer.h"
  40 #include "charset.h"
  41 #include "composite.h"
  42 #include "disptab.h"
  43
  44 #else  /* not emacs */
  45
  46 #include "mulelib.h"
  47
  48 #endif /* emacs */
  49
/* The symbol `characterp' (set up by DEFSYM in syms_of_character).  */
Lisp_Object Qcharacterp;

/* Vector of all translation tables ever defined.
   The ID of a translation table is used to index this vector.  */
Lisp_Object Vtranslation_table_vector;

/* A char-table for characters which may invoke auto-filling.  */
Lisp_Object Vauto_fill_chars;

/* The symbol `auto-fill-chars' (set up by DEFSYM in
   syms_of_character); it is passed to Fmake_char_table when
   Vauto_fill_chars is created.  */
Lisp_Object Qauto_fill_chars;

/* Char-table of information about which character to unify to which
   Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
Lisp_Object Vchar_unify_table;

/* A char-table.  An element is non-nil iff the corresponding
   character has a printable glyph.  */
Lisp_Object Vprintable_chars;

/* A char-table.  An element is the column width of the corresponding
   character.  */
Lisp_Object Vchar_width_table;

/* A char-table.  An element is a symbol indicating the direction
   property of the corresponding character.  */
Lisp_Object Vchar_direction_table;

/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
unsigned char *_fetch_multibyte_char_p;

/* Char table of scripts.  */
Lisp_Object Vchar_script_table;

/* Alist of scripts vs representative characters.  */
Lisp_Object Vscript_representative_chars;

/* NOTE(review): presumably the symbol `char-script-table'; its DEFSYM
   is not in the visible part of this file -- confirm.  */
static Lisp_Object Qchar_script_table;

/* NOTE(review): presumably a char-table of Unicode general
   categories; it is not initialized in the visible part of this
   file -- confirm.  */
Lisp_Object Vunicode_category_table;

/* Mapping table from unibyte chars to multibyte chars, indexed by the
   unibyte code (256 entries).  */
int unibyte_to_multibyte_table[256];
  92
  93 \f
  94
  95 /* If character code C has modifier masks, reflect them to the
  96    character code if possible.  Return the resulting code.  */
  97
  98 int
  99 char_resolve_modifier_mask (c)
 100      int c;
 101 {
 102   /* A non-ASCII character can't reflect modifier bits to the code.  */
 103   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 104     return c;
 105
 106   /* For Meta, Shift, and Control modifiers, we need special care.  */
 107   if (c & CHAR_SHIFT)
 108     {
 109       /* Shift modifier is valid only with [A-Za-z].  */
 110       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 111         c &= ~CHAR_SHIFT;
 112       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 113         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 114       /* Shift modifier for control characters and SPC is ignored.  */
 115       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
 116         c &= ~CHAR_SHIFT;
 117     }
 118   if (c & CHAR_CTL)
 119     {
 120       /* Simulate the code in lread.c.  */
 121       /* Allow `\C- ' and `\C-?'.  */
 122       if ((c & 0377) == ' ')
 123         c &= ~0177 & ~ CHAR_CTL;
 124       else if ((c & 0377) == '?')
 125         c = 0177 | (c & ~0177 & ~CHAR_CTL);
 126       /* ASCII control chars are made from letters (both cases),
 127          as well as the non-letters within 0100...0137.  */
 128       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 129         c &= (037 | (~0177 & ~CHAR_CTL));
 130       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 131         c &= (037 | (~0177 & ~CHAR_CTL));
 132     }
 133   if (c & CHAR_META)
 134     {
 135       /* Move the meta bit to the right place for a string.  */
 136       c = (c & ~CHAR_META) | 0x80;
 137     }
 138
 139   return c;
 140 }
 141
 142
 143 /* Store multibyte form of character C at P.  If C has modifier bits,
 144    handle them appropriately.  */
 145
 146 int
 147 char_string (c, p)
 148      unsigned c;
 149      unsigned char *p;
 150 {
 151   int bytes;
 152
 153   if (c & CHAR_MODIFIER_MASK)
 154     {
 155       c = (unsigned) char_resolve_modifier_mask ((int) c);
 156       /* If C still has any modifier bits, just ignore it.  */
 157       c &= ~CHAR_MODIFIER_MASK;
 158     }
 159
 160   MAYBE_UNIFY_CHAR (c);
 161
 162   if (c <= MAX_3_BYTE_CHAR)
 163     {
 164       bytes = CHAR_STRING (c, p);
 165     }
 166   else if (c <= MAX_4_BYTE_CHAR)
 167     {
 168       p[0] = (0xF0 | (c >> 18));
 169       p[1] = (0x80 | ((c >> 12) & 0x3F));
 170       p[2] = (0x80 | ((c >> 6) & 0x3F));
 171       p[3] = (0x80 | (c & 0x3F));
 172       bytes = 4;
 173     }
 174   else if (c <= MAX_5_BYTE_CHAR)
 175     {
 176       p[0] = 0xF8;
 177       p[1] = (0x80 | ((c >> 18) & 0x0F));
 178       p[2] = (0x80 | ((c >> 12) & 0x3F));
 179       p[3] = (0x80 | ((c >> 6) & 0x3F));
 180       p[4] = (0x80 | (c & 0x3F));
 181       bytes = 5;
 182     }
 183   else if (c <= MAX_CHAR)
 184     {
 185       c = CHAR_TO_BYTE8 (c);
 186       bytes = BYTE8_STRING (c, p);
 187     }
 188   else
 189     error ("Invalid character: %d", c);
 190
 191   return bytes;
 192 }
 193
 194
 195 /* Return a character whose multibyte form is at P.  Set LEN is not
 196    NULL, it must be a pointer to integer.  In that case, set *LEN to
 197    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 198    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 199    the ending address (i.e. the starting address of the next
 200    character) of the multibyte form.  */
 201
 202 int
 203 string_char (p, advanced, len)
 204      const unsigned char *p;
 205      const unsigned char **advanced;
 206      int *len;
 207 {
 208   int c;
 209   const unsigned char *saved_p = p;
 210
 211   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 212     {
 213       c = STRING_CHAR_ADVANCE (p);
 214     }
 215   else if (! (*p & 0x08))
 216     {
 217       c = ((((p)[0] & 0xF) << 18)
 218            | (((p)[1] & 0x3F) << 12)
 219            | (((p)[2] & 0x3F) << 6)
 220            | ((p)[3] & 0x3F));
 221       p += 4;
 222     }
 223   else
 224     {
 225       c = ((((p)[1] & 0x3F) << 18)
 226            | (((p)[2] & 0x3F) << 12)
 227            | (((p)[3] & 0x3F) << 6)
 228            | ((p)[4] & 0x3F));
 229       p += 5;
 230     }
 231
 232   MAYBE_UNIFY_CHAR (c);
 233
 234   if (len)
 235     *len = p - saved_p;
 236   if (advanced)
 237     *advanced = p;
 238   return c;
 239 }
 240
 241
 242 /* Translate character C by translation table TABLE.  If C is
 243    negative, translate a character specified by CHARSET and CODE.  If
 244    no translation is found in TABLE, return the untranslated
 245    character.  If TABLE is a list, elements are char tables.  In this
 246    case, translace C by all tables.  */
 247
 248 int
 249 translate_char (table, c)
 250      Lisp_Object table;
 251      int c;
 252 {
 253   if (CHAR_TABLE_P (table))
 254     {
 255       Lisp_Object ch;
 256
 257       ch = CHAR_TABLE_REF (table, c);
 258       if (CHARACTERP (ch))
 259         c = XINT (ch);
 260     }
 261   else
 262     {
 263       for (; CONSP (table); table = XCDR (table))
 264         c = translate_char (XCAR (table), c);
 265     }
 266   return c;
 267 }
 268
 269 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 270    them, return (C & 0xFF).
 271
 272    The argument REV_TBL is now ignored.  It will be removed in the
 273    future.  */
 274
 275 int
 276 multibyte_char_to_unibyte (c, rev_tbl)
 277      int c;
 278      Lisp_Object rev_tbl;
 279 {
 280   if (c < 0x80)
 281     return c;
 282   if (CHAR_BYTE8_P (c))
 283     return CHAR_TO_BYTE8 (c);
 284   return (c & 0xFF);
 285 }
 286
 287 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 288    by charset_unibyte.  */
 289
 290 int
 291 multibyte_char_to_unibyte_safe (c)
 292      int c;
 293 {
 294   if (c < 0x80)
 295     return c;
 296   if (CHAR_BYTE8_P (c))
 297     return CHAR_TO_BYTE8 (c);
 298   return -1;
 299 }
 300
 301 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 302        doc: /* Return non-nil if OBJECT is a character.  */)
 303      (object, ignore)
 304      Lisp_Object object, ignore;
 305 {
 306   return (CHARACTERP (object) ? Qt : Qnil);
 307 }
 308
/* Lisp accessor for MAX_CHAR; takes no arguments.  */
DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
       doc: /* Return the character of the maximum code.  */)
     ()
{
  /* MAX_CHAR is returned as a Lisp integer.  */
  return make_number (MAX_CHAR);
}
 315
 316 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 317        Sunibyte_char_to_multibyte, 1, 1, 0,
 318        doc: /* Convert the byte CH to multibyte character.  */)
 319      (ch)
 320      Lisp_Object ch;
 321 {
 322   int c;
 323
 324   CHECK_CHARACTER (ch);
 325   c = XFASTINT (ch);
 326   if (c >= 0x100)
 327     error ("Not a unibyte character: %d", c);
 328   if (c >= 0x80)
 329     c = BYTE8_TO_CHAR (c);
 330   return make_number (c);
 331 }
 332
 333 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 334        Smultibyte_char_to_unibyte, 1, 1, 0,
 335        doc: /* Convert the multibyte character CH to a byte.
 336 If the multibyte character does not represent a byte, return -1.  */)
 337      (ch)
 338      Lisp_Object ch;
 339 {
 340   int cm;
 341
 342   CHECK_CHARACTER (ch);
 343   cm = XFASTINT (ch);
 344   if (cm < 256)
 345     /* Can't distinguish a byte read from a unibyte buffer from
 346        a latin1 char, so let's let it slide.  */
 347     return ch;
 348   else
 349     {
 350       int cu = CHAR_TO_BYTE_SAFE (cm);
 351       return make_number (cu);
 352     }
 353 }
 354
/* Obsolete Lisp function kept for backward compatibility: after
   checking that CH is a character, it always returns 1.  */
DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
       doc: /* Return 1 regardless of the argument CHAR.
This is now an obsolete function.  We keep it just for backward compatibility.
usage: (char-bytes CHAR)  */)
     (ch)
     Lisp_Object ch;
{
  /* The argument is still validated even though its value is unused.  */
  CHECK_CHARACTER (ch);
  return make_number (1);
}
 365
 366 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 367        doc: /* Return width of CHAR when displayed in the current buffer.
 368 The width is measured by how many columns it occupies on the screen.
 369 Tab is taken to occupy `tab-width' columns.
 370 usage: (char-width CHAR)  */)
 371      (ch)
 372        Lisp_Object ch;
 373 {
 374   Lisp_Object disp;
 375   int c, width;
 376   struct Lisp_Char_Table *dp = buffer_display_table ();
 377
 378   CHECK_CHARACTER (ch);
 379   c = XINT (ch);
 380
 381   /* Get the way the display table would display it.  */
 382   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 383
 384   if (VECTORP (disp))
 385     width = ASIZE (disp);
 386   else
 387     width = CHAR_WIDTH (c);
 388
 389   return make_number (width);
 390 }
 391
 392 /* Return width of string STR of length LEN when displayed in the
 393    current buffer.  The width is measured by how many columns it
 394    occupies on the screen.  If PRECISION > 0, return the width of
 395    longest substring that doesn't exceed PRECISION, and set number of
 396    characters and bytes of the substring in *NCHARS and *NBYTES
 397    respectively.  */
 398
 399 int
 400 c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes)
 401 {
 402   int i = 0, i_byte = 0;
 403   int width = 0;
 404   struct Lisp_Char_Table *dp = buffer_display_table ();
 405
 406   while (i_byte < len)
 407     {
 408       int bytes, thiswidth;
 409       Lisp_Object val;
 410       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 411
 412       if (dp)
 413         {
 414           val = DISP_CHAR_VECTOR (dp, c);
 415           if (VECTORP (val))
 416             thiswidth = XVECTOR (val)->size;
 417           else
 418             thiswidth = CHAR_WIDTH (c);
 419         }
 420       else
 421         {
 422           thiswidth = CHAR_WIDTH (c);
 423         }
 424
 425       if (precision > 0
 426           && (width + thiswidth > precision))
 427         {
 428           *nchars = i;
 429           *nbytes = i_byte;
 430           return width;
 431         }
 432       i++;
 433       i_byte += bytes;
 434       width += thiswidth;
 435   }
 436
 437   if (precision > 0)
 438     {
 439       *nchars = i;
 440       *nbytes = i_byte;
 441     }
 442
 443   return width;
 444 }
 445
/* Return width of string STR of length LEN when displayed in the
   current buffer.  The width is measured by how many columns it
   occupies on the screen.  This is c_string_width without a
   precision limit.  */

int
strwidth (str, len)
     unsigned char *str;
     int len;
{
  /* A non-positive PRECISION means c_string_width never stores through
     the NCHARS/NBYTES pointers, so passing NULL is safe here.  */
  return c_string_width (str, len, -1, NULL, NULL);
}
 457
 458 /* Return width of Lisp string STRING when displayed in the current
 459    buffer.  The width is measured by how many columns it occupies on
 460    the screen while paying attention to compositions.  If PRECISION >
 461    0, return the width of longest substring that doesn't exceed
 462    PRECISION, and set number of characters and bytes of the substring
 463    in *NCHARS and *NBYTES respectively.  */
 464
 465 int
 466 lisp_string_width (string, precision, nchars, nbytes)
 467      Lisp_Object string;
 468      int precision, *nchars, *nbytes;
 469 {
 470   int len = SCHARS (string);
 471   /* This set multibyte to 0 even if STRING is multibyte when it
 472      contains only ascii and eight-bit-graphic, but that's
 473      intentional.  */
 474   int multibyte = len < SBYTES (string);
 475   unsigned char *str = SDATA (string);
 476   int i = 0, i_byte = 0;
 477   int width = 0;
 478   struct Lisp_Char_Table *dp = buffer_display_table ();
 479
 480   while (i < len)
 481     {
 482       int chars, bytes, thiswidth;
 483       Lisp_Object val;
 484       int cmp_id;
 485       EMACS_INT ignore, end;
 486
 487       if (find_composition (i, -1, &ignore, &end, &val, string)
 488           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 489               >= 0))
 490         {
 491           thiswidth = composition_table[cmp_id]->width;
 492           chars = end - i;
 493           bytes = string_char_to_byte (string, end) - i_byte;
 494         }
 495       else
 496         {
 497           int c;
 498
 499           if (multibyte)
 500             c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 501           else
 502             c = str[i_byte], bytes = 1;
 503           chars = 1;
 504           if (dp)
 505             {
 506               val = DISP_CHAR_VECTOR (dp, c);
 507               if (VECTORP (val))
 508                 thiswidth = XVECTOR (val)->size;
 509               else
 510                 thiswidth = CHAR_WIDTH (c);
 511             }
 512           else
 513             {
 514               thiswidth = CHAR_WIDTH (c);
 515             }
 516         }
 517
 518       if (precision > 0
 519           && (width + thiswidth > precision))
 520         {
 521           *nchars = i;
 522           *nbytes = i_byte;
 523           return width;
 524         }
 525       i += chars;
 526       i_byte += bytes;
 527       width += thiswidth;
 528   }
 529
 530   if (precision > 0)
 531     {
 532       *nchars = i;
 533       *nbytes = i_byte;
 534     }
 535
 536   return width;
 537 }
 538
/* Lisp entry point for lisp_string_width, called without a precision
   limit.  */
DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
       doc: /* Return width of STRING when displayed in the current buffer.
Width is measured by how many columns it occupies on the screen.
When calculating width of a multibyte character in STRING,
only the base leading-code is considered; the validity of
the following bytes is not checked.  Tabs in STRING are always
taken to occupy `tab-width' columns.
usage: (string-width STRING)  */)
     (str)
     Lisp_Object str;
{
  Lisp_Object val;

  CHECK_STRING (str);
  /* With PRECISION = -1 the NCHARS/NBYTES pointers are never
     dereferenced, so NULL is fine.  */
  XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
  return val;
}
 556
 557 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 558        doc: /* Return the direction of CHAR.
 559 The returned value is 0 for left-to-right and 1 for right-to-left.
 560 usage: (char-direction CHAR)  */)
 561      (ch)
 562      Lisp_Object ch;
 563 {
 564   int c;
 565
 566   CHECK_CHARACTER (ch);
 567   c = XINT (ch);
 568   return CHAR_TABLE_REF (Vchar_direction_table, c);
 569 }
 570
 571 /* Return the number of characters in the NBYTES bytes at PTR.
 572    This works by looking at the contents and checking for multibyte
 573    sequences while assuming that there's no invalid sequence.
 574    However, if the current buffer has enable-multibyte-characters =
 575    nil, we treat each byte as a character.  */
 576
 577 EMACS_INT
 578 chars_in_text (ptr, nbytes)
 579      const unsigned char *ptr;
 580      EMACS_INT nbytes;
 581 {
 582   /* current_buffer is null at early stages of Emacs initialization.  */
 583   if (current_buffer == 0
 584       || NILP (current_buffer->enable_multibyte_characters))
 585     return nbytes;
 586
 587   return multibyte_chars_in_text (ptr, nbytes);
 588 }
 589
 590 /* Return the number of characters in the NBYTES bytes at PTR.
 591    This works by looking at the contents and checking for multibyte
 592    sequences while assuming that there's no invalid sequence.  It
 593    ignores enable-multibyte-characters.  */
 594
 595 EMACS_INT
 596 multibyte_chars_in_text (ptr, nbytes)
 597      const unsigned char *ptr;
 598      EMACS_INT nbytes;
 599 {
 600   const unsigned char *endp = ptr + nbytes;
 601   int chars = 0;
 602
 603   while (ptr < endp)
 604     {
 605       int len = MULTIBYTE_LENGTH (ptr, endp);
 606
 607       if (len == 0)
 608         abort ();
 609       ptr += len;
 610       chars++;
 611     }
 612
 613   return chars;
 614 }
 615
 616 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 617    characters and bytes in it, and store them in *NCHARS and *NBYTES
 618    respectively.  On counting bytes, pay attention to that 8-bit
 619    characters not constructing a valid multibyte sequence are
 620    represented by 2-byte in a multibyte text.  */
 621
 622 void
 623 parse_str_as_multibyte (str, len, nchars, nbytes)
 624      const unsigned char *str;
 625      int len, *nchars, *nbytes;
 626 {
 627   const unsigned char *endp = str + len;
 628   int n, chars = 0, bytes = 0;
 629
 630   if (len >= MAX_MULTIBYTE_LENGTH)
 631     {
 632       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 633       while (str < adjusted_endp)
 634         {
 635           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 636             str += n, bytes += n;
 637           else
 638             str++, bytes += 2;
 639           chars++;
 640         }
 641     }
 642   while (str < endp)
 643     {
 644       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 645         str += n, bytes += n;
 646       else
 647         str++, bytes += 2;
 648       chars++;
 649     }
 650
 651   *nchars = chars;
 652   *nbytes = bytes;
 653   return;
 654 }
 655
 656 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 657    It actually converts only such 8-bit characters that don't contruct
 658    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 659    NCHARS is nonzero, set *NCHARS to the number of characters in the
 660    text.  It is assured that we can use LEN bytes at STR as a work
 661    area and that is enough.  Return the number of bytes of the
 662    resulting text.  */
 663
 664 int
 665 str_as_multibyte (str, len, nbytes, nchars)
 666      unsigned char *str;
 667      int len, nbytes, *nchars;
 668 {
 669   unsigned char *p = str, *endp = str + nbytes;
 670   unsigned char *to;
 671   int chars = 0;
 672   int n;
 673
 674   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 675     {
 676       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 677       while (p < adjusted_endp
 678              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 679         p += n, chars++;
 680     }
 681   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 682     p += n, chars++;
 683   if (nchars)
 684     *nchars = chars;
 685   if (p == endp)
 686     return nbytes;
 687
 688   to = p;
 689   nbytes = endp - p;
 690   endp = str + len;
 691   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 692   p = endp - nbytes;
 693
 694   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 695     {
 696       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 697       while (p < adjusted_endp)
 698         {
 699           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 700             {
 701               while (n--)
 702                 *to++ = *p++;
 703             }
 704           else
 705             {
 706               int c = *p++;
 707               c = BYTE8_TO_CHAR (c);
 708               to += CHAR_STRING (c, to);
 709             }
 710         }
 711       chars++;
 712     }
 713   while (p < endp)
 714     {
 715       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 716         {
 717           while (n--)
 718             *to++ = *p++;
 719         }
 720       else
 721         {
 722           int c = *p++;
 723           c = BYTE8_TO_CHAR (c);
 724           to += CHAR_STRING (c, to);
 725         }
 726       chars++;
 727     }
 728   if (nchars)
 729     *nchars = chars;
 730   return (to - str);
 731 }
 732
 733 /* Parse unibyte string at STR of LEN bytes, and return the number of
 734    bytes it may ocupy when converted to multibyte string by
 735    `str_to_multibyte'.  */
 736
 737 int
 738 parse_str_to_multibyte (str, len)
 739      unsigned char *str;
 740      int len;
 741 {
 742   unsigned char *endp = str + len;
 743   int bytes;
 744
 745   for (bytes = 0; str < endp; str++)
 746     bytes += (*str < 0x80) ? 1 : 2;
 747   return bytes;
 748 }
 749
 750
 751 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 752    that contains the same single-byte characters.  It actually
 753    converts all 8-bit characters to multibyte forms.  It is assured
 754    that we can use LEN bytes at STR as a work area and that is
 755    enough.  */
 756
 757 int
 758 str_to_multibyte (str, len, bytes)
 759      unsigned char *str;
 760      int len, bytes;
 761 {
 762   unsigned char *p = str, *endp = str + bytes;
 763   unsigned char *to;
 764
 765   while (p < endp && *p < 0x80) p++;
 766   if (p == endp)
 767     return bytes;
 768   to = p;
 769   bytes = endp - p;
 770   endp = str + len;
 771   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 772   p = endp - bytes;
 773   while (p < endp)
 774     {
 775       int c = *p++;
 776
 777       if (c >= 0x80)
 778         c = BYTE8_TO_CHAR (c);
 779       to += CHAR_STRING (c, to);
 780     }
 781   return (to - str);
 782 }
 783
 784 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 785    actually converts characters in the range 0x80..0xFF to
 786    unibyte.  */
 787
 788 int
 789 str_as_unibyte (str, bytes)
 790      unsigned char *str;
 791      int bytes;
 792 {
 793   const unsigned char *p = str, *endp = str + bytes;
 794   unsigned char *to;
 795   int c, len;
 796
 797   while (p < endp)
 798     {
 799       c = *p;
 800       len = BYTES_BY_CHAR_HEAD (c);
 801       if (CHAR_BYTE8_HEAD_P (c))
 802         break;
 803       p += len;
 804     }
 805   to = str + (p - str);
 806   while (p < endp)
 807     {
 808       c = *p;
 809       len = BYTES_BY_CHAR_HEAD (c);
 810       if (CHAR_BYTE8_HEAD_P (c))
 811         {
 812           c = STRING_CHAR_ADVANCE (p);
 813           *to++ = CHAR_TO_BYTE8 (c);
 814         }
 815       else
 816         {
 817           while (len--) *to++ = *p++;
 818         }
 819     }
 820   return (to - str);
 821 }
 822
 823 /* Convert eight-bit chars in SRC (in multibyte form) to the
 824    corresponding byte and store in DST.  CHARS is the number of
 825    characters in SRC.  The value is the number of bytes stored in DST.
 826    Usually, the value is the same as CHARS, but is less than it if SRC
 827    contains a non-ASCII, non-eight-bit characater.  If ACCEPT_LATIN_1
 828    is nonzero, a Latin-1 character is accepted and converted to a byte
 829    of that character code.
 830    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 831
 832 EMACS_INT
 833 str_to_unibyte (src, dst, chars, accept_latin_1)
 834      const unsigned char *src;
 835      unsigned char *dst;
 836      EMACS_INT chars;
 837      int accept_latin_1;
 838 {
 839   EMACS_INT i;
 840
 841   for (i = 0; i < chars; i++)
 842     {
 843       int c = STRING_CHAR_ADVANCE (src);
 844
 845       if (CHAR_BYTE8_P (c))
 846         c = CHAR_TO_BYTE8 (c);
 847       else if (! ASCII_CHAR_P (c)
 848                && (! accept_latin_1 || c >= 0x100))
 849         return i;
 850       *dst++ = c;
 851     }
 852   return i;
 853 }
 854
 855
 856 int
 857 string_count_byte8 (string)
 858      Lisp_Object string;
 859 {
 860   int multibyte = STRING_MULTIBYTE (string);
 861   int nbytes = SBYTES (string);
 862   unsigned char *p = SDATA (string);
 863   unsigned char *pend = p + nbytes;
 864   int count = 0;
 865   int c, len;
 866
 867   if (multibyte)
 868     while (p < pend)
 869       {
 870         c = *p;
 871         len = BYTES_BY_CHAR_HEAD (c);
 872
 873         if (CHAR_BYTE8_HEAD_P (c))
 874           count++;
 875         p += len;
 876       }
 877   else
 878     while (p < pend)
 879       {
 880         if (*p++ >= 0x80)
 881           count++;
 882       }
 883   return count;
 884 }
 885
 886
 887 Lisp_Object
 888 string_escape_byte8 (string)
 889      Lisp_Object string;
 890 {
 891   int nchars = SCHARS (string);
 892   int nbytes = SBYTES (string);
 893   int multibyte = STRING_MULTIBYTE (string);
 894   int byte8_count;
 895   const unsigned char *src, *src_end;
 896   unsigned char *dst;
 897   Lisp_Object val;
 898   int c, len;
 899
 900   if (multibyte && nchars == nbytes)
 901     return string;
 902
 903   byte8_count = string_count_byte8 (string);
 904
 905   if (byte8_count == 0)
 906     return string;
 907
 908   if (multibyte)
 909     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 910     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 911                                         nbytes + byte8_count * 2);
 912   else
 913     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 914     val = make_uninit_string (nbytes + byte8_count * 3);
 915
 916   src = SDATA (string);
 917   src_end = src + nbytes;
 918   dst = SDATA (val);
 919   if (multibyte)
 920     while (src < src_end)
 921       {
 922         c = *src;
 923         len = BYTES_BY_CHAR_HEAD (c);
 924
 925         if (CHAR_BYTE8_HEAD_P (c))
 926           {
 927             c = STRING_CHAR_ADVANCE (src);
 928             c = CHAR_TO_BYTE8 (c);
 929             sprintf ((char *) dst, "\\%03o", c);
 930             dst += 4;
 931           }
 932         else
 933           while (len--) *dst++ = *src++;
 934       }
 935   else
 936     while (src < src_end)
 937       {
 938         c = *src++;
 939         if (c >= 0x80)
 940           {
 941             sprintf ((char *) dst, "\\%03o", c);
 942             dst += 4;
 943           }
 944         else
 945           *dst++ = c;
 946       }
 947   return val;
 948 }
 949
 950 \f
 951 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 952        doc: /*
 953 Concatenate all the argument characters and make the result a string.
 954 usage: (string &rest CHARACTERS)  */)
 955      (n, args)
 956      int n;
 957      Lisp_Object *args;
 958 {
 959   int i;
 960   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 961   unsigned char *p = buf;
 962   int c;
 963
 964   for (i = 0; i < n; i++)
 965     {
 966       CHECK_CHARACTER (args[i]);
 967       c = XINT (args[i]);
 968       p += CHAR_STRING (c, p);
 969     }
 970
 971   return make_string_from_bytes ((char *) buf, n, p - buf);
 972 }
 973
 974 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 975        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 976 usage: (unibyte-string &rest BYTES)  */)
 977      (n, args)
 978      int n;
 979      Lisp_Object *args;
 980 {
 981   int i;
 982   unsigned char *buf = (unsigned char *) alloca (n);
 983   unsigned char *p = buf;
 984   unsigned c;
 985
 986   for (i = 0; i < n; i++)
 987     {
 988       CHECK_NATNUM (args[i]);
 989       c = XFASTINT (args[i]);
 990       if (c >= 256)
 991         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 992       *p++ = c;
 993     }
 994
 995   return make_string_from_bytes ((char *) buf, n, p - buf);
 996 }
 997
 998 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 999        Schar_resolve_modifiers, 1, 1, 0,
1000        doc: /* Resolve modifiers in the character CHAR.
1001 The value is a character with modifiers resolved into the character
1002 code.  Unresolved modifiers are kept in the value.
1003 usage: (char-resolve-modifiers CHAR)  */)
1004      (character)
1005      Lisp_Object character;
1006 {
1007   int c;
1008
1009   CHECK_NUMBER (character);
1010   c = XINT (character);
1011   return make_number (char_resolve_modifier_mask (c));
1012 }
1013
/* Lisp entry point returning the byte value of one character, taken
   either from the current buffer or from STRING.  Unibyte text yields
   the raw byte directly; in multibyte text the character is decoded
   and must be ASCII or an 8-bit character.  */
DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
       doc: /* Return a byte value of a character at point.
Optional 1st arg POSITION, if non-nil, is a position of a character to get
a byte value.
Optional 2nd arg STRING, if non-nil, is a string of which first
character is a target to get a byte value.  In this case, POSITION, if
non-nil, is an index of a target character in the string.

If the current buffer (or STRING) is multibyte, and the target
character is not ASCII nor 8-bit character, an error is signalled.  */)
     (position, string)
     Lisp_Object position, string;
{
  int c;
  EMACS_INT pos;
  unsigned char *p;

  if (NILP (string))
    {
      /* No STRING: the target character is in the current buffer.  */
      if (NILP (position))
        {
          /* Default to the character at point.  */
          p = PT_ADDR;
        }
      else
        {
          CHECK_NUMBER_COERCE_MARKER (position);
          /* POSITION must lie in [BEGV, ZV).  */
          if (XINT (position) < BEGV || XINT (position) >= ZV)
            args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
          pos = XFASTINT (position);
          p = CHAR_POS_ADDR (pos);
        }
      /* In a unibyte buffer the byte at P is the answer.  */
      if (NILP (current_buffer->enable_multibyte_characters))
        return make_number (*p);
    }
  else
    {
      CHECK_STRING (string);
      if (NILP (position))
        {
          /* Default to the first character of STRING.  */
          p = SDATA (string);
        }
      else
        {
          CHECK_NATNUM (position);
          /* POSITION is a character index and must be below the
             character length of STRING.  */
          if (XINT (position) >= SCHARS (string))
            args_out_of_range (string, position);
          pos = XFASTINT (position);
          p = SDATA (string) + string_char_to_byte (string, pos);
        }
      /* In a unibyte string the byte at P is the answer.  */
      if (! STRING_MULTIBYTE (string))
        return make_number (*p);
    }
  /* Multibyte text: decode the character at P, then map an 8-bit
     character back to its byte; any other non-ASCII character is an
     error.  */
  c = STRING_CHAR (p, 0);
  if (CHAR_BYTE8_P (c))
    c = CHAR_TO_BYTE8 (c);
  else if (! ASCII_CHAR_P (c))
    error ("Not an ASCII nor an 8-bit character: %d", c);
  return make_number (c);
}
1073
1074
/* Initialization hook for this file.  It currently does nothing: the
   body is empty.  NOTE(review): judging by the name it is presumably
   called once during startup -- confirm against the caller.  */
void
init_character_once ()
{
}
1079
1080 #ifdef emacs
1081
/* Set up the Lisp-visible symbols, primitives and variables of this
   file: define the symbols it uses, register its subrs, and declare
   and initialize its char-table and alist variables.  */
void
syms_of_character ()
{
  /* Symbols referenced from C code in this file.  */
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table is not declared with DEFVAR_LISP below, so it
     is staticpro'd explicitly (presumably to keep it visible to the
     garbage collector -- confirm).  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Register the subr objects of this file's primitives.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_bytes);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Schar_direction);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);
  defsubr (&Sget_byte);

  DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Start with room for 16 entries, all nil.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  /* Initially only space and newline are marked with t.  */
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", &Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* The default width is 1.  The ranges 0x80..0x9F and
     MAX_5_BYTE_CHAR + 1..MAX_CHAR get width 4 (presumably because
     they are displayed as escape sequences -- confirm).  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
               doc: /* A char-table for direction of each character.  */);
  /* Every character starts with the value 1.  */
  Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));

  DEFVAR_LISP ("printable-chars", &Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* The default is nil; the ranges 32..126 and 160..MAX_5_BYTE_CHAR
     are set to t.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", &Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* The extra-slot count is put on the symbol before the table is
     created (presumably Fmake_char_table reads this property to size
     the table -- confirm).  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.
Each element is a cons (SCRIPT . CHARS).
SCRIPT is a symbol representing a script or a subgroup of a script.
CHARS is a list or a vector of characters.
If it is a list, all characters in the list are necessary for supporting SCRIPT.
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
  Vscript_representative_chars = Qnil;

  DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
               doc: /* Char table of Unicode's "General Category".
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* The correct char-table is setup in characters.el.  */
  Vunicode_category_table = Qnil;
}
1171
1172 #endif /* emacs */
1173
1174 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1175    (do not change this comment) */