src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   5      Free Software Foundation, Inc.
   6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include "lisp.h"
  38 #include "character.h"
  39 #include "buffer.h"
  40 #include "charset.h"
  41 #include "composite.h"
  42 #include "disptab.h"
  43
  44 #else  /* not emacs */
  45
  46 #include "mulelib.h"
  47
  48 #endif /* emacs */
  49
/* Symbol `characterp'; set up by DEFSYM in syms_of_character.  */
Lisp_Object Qcharacterp;

/* Vector of all translation tables ever defined.
   The ID of a translation table is used to index this vector.  */
Lisp_Object Vtranslation_table_vector;

/* A char-table for characters which may invoke auto-filling.  */
Lisp_Object Vauto_fill_chars;

/* Symbol `auto-fill-chars'; set up by DEFSYM in syms_of_character.  */
Lisp_Object Qauto_fill_chars;

/* Char-table of information about which character to unify to which
   Unicode character.  */
Lisp_Object Vchar_unify_table;

/* A char-table.  An element is non-nil iff the corresponding
   character has a printable glyph.  */
Lisp_Object Vprintable_chars;

/* A char-table.  An element is the column width of the corresponding
   character.  */
Lisp_Object Vchar_width_table;

/* A char-table.  An element is a symbol indicating the direction
   property of the corresponding character.  */
Lisp_Object Vchar_direction_table;

/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
unsigned char *_fetch_multibyte_char_p;

/* Char table of scripts.  */
Lisp_Object Vchar_script_table;

/* Alist of scripts vs representative characters.  */
Lisp_Object Vscript_representative_chars;

/* Symbol `char-script-table'; set up by DEFSYM in syms_of_character.  */
static Lisp_Object Qchar_script_table;

/* Mapping table from unibyte chars to multibyte chars.  */
int unibyte_to_multibyte_table[256];

/* Nth element is 1 iff unibyte char N can be mapped to a multibyte
   char.  */
char unibyte_has_multibyte_table[256];
  94
  95 \f
  96
  97 /* If character code C has modifier masks, reflect them to the
  98    character code if possible.  Return the resulting code.  */
  99
 100 int
 101 char_resolve_modifier_mask (c)
 102      int c;
 103 {
 104   /* A non-ASCII character can't reflect modifier bits to the code.  */
 105   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 106     return c;
 107
 108   /* For Meta, Shift, and Control modifiers, we need special care.  */
 109   if (c & CHAR_SHIFT)
 110     {
 111       /* Shift modifier is valid only with [A-Za-z].  */
 112       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 113         c &= ~CHAR_SHIFT;
 114       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 115         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 116       /* Shift modifier for control characters and SPC is ignored.  */
 117       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
 118         c &= ~CHAR_SHIFT;
 119     }
 120   if (c & CHAR_CTL)
 121     {
 122       /* Simulate the code in lread.c.  */
 123       /* Allow `\C- ' and `\C-?'.  */
 124       if ((c & 0377) == ' ')
 125         c &= ~0177 & ~ CHAR_CTL;
 126       else if ((c & 0377) == '?')
 127         c = 0177 | (c & ~0177 & ~CHAR_CTL);
 128       /* ASCII control chars are made from letters (both cases),
 129          as well as the non-letters within 0100...0137.  */
 130       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 131         c &= (037 | (~0177 & ~CHAR_CTL));
 132       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 133         c &= (037 | (~0177 & ~CHAR_CTL));
 134     }
 135   if (c & CHAR_META)
 136     {
 137       /* Move the meta bit to the right place for a string.  */
 138       c = (c & ~CHAR_META) | 0x80;
 139     }
 140
 141   return c;
 142 }
 143
 144
 145 /* Store multibyte form of character C at P.  If C has modifier bits,
 146    handle them appropriately.  */
 147
 148 int
 149 char_string (c, p)
 150      unsigned c;
 151      unsigned char *p;
 152 {
 153   int bytes;
 154
 155   if (c & CHAR_MODIFIER_MASK)
 156     {
 157       c = (unsigned) char_resolve_modifier_mask ((int) c);
 158       /* If C still has any modifier bits, just ignore it.  */
 159       c &= ~CHAR_MODIFIER_MASK;
 160     }
 161
 162   MAYBE_UNIFY_CHAR (c);
 163
 164   if (c <= MAX_3_BYTE_CHAR)
 165     {
 166       bytes = CHAR_STRING (c, p);
 167     }
 168   else if (c <= MAX_4_BYTE_CHAR)
 169     {
 170       p[0] = (0xF0 | (c >> 18));
 171       p[1] = (0x80 | ((c >> 12) & 0x3F));
 172       p[2] = (0x80 | ((c >> 6) & 0x3F));
 173       p[3] = (0x80 | (c & 0x3F));
 174       bytes = 4;
 175     }
 176   else if (c <= MAX_5_BYTE_CHAR)
 177     {
 178       p[0] = 0xF8;
 179       p[1] = (0x80 | ((c >> 18) & 0x0F));
 180       p[2] = (0x80 | ((c >> 12) & 0x3F));
 181       p[3] = (0x80 | ((c >> 6) & 0x3F));
 182       p[4] = (0x80 | (c & 0x3F));
 183       bytes = 5;
 184     }
 185   else if (c <= MAX_CHAR)
 186     {
 187       c = CHAR_TO_BYTE8 (c);
 188       bytes = BYTE8_STRING (c, p);
 189     }
 190   else
 191     error ("Invalid character: %d", c);
 192
 193   return bytes;
 194 }
 195
 196
 197 /* Return a character whose multibyte form is at P.  Set LEN is not
 198    NULL, it must be a pointer to integer.  In that case, set *LEN to
 199    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 200    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 201    the ending address (i.e. the starting address of the next
 202    character) of the multibyte form.  */
 203
 204 int
 205 string_char (p, advanced, len)
 206      const unsigned char *p;
 207      const unsigned char **advanced;
 208      int *len;
 209 {
 210   int c;
 211   const unsigned char *saved_p = p;
 212
 213   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 214     {
 215       c = STRING_CHAR_ADVANCE (p);
 216     }
 217   else if (! (*p & 0x08))
 218     {
 219       c = ((((p)[0] & 0xF) << 18)
 220            | (((p)[1] & 0x3F) << 12)
 221            | (((p)[2] & 0x3F) << 6)
 222            | ((p)[3] & 0x3F));
 223       p += 4;
 224     }
 225   else
 226     {
 227       c = ((((p)[1] & 0x3F) << 18)
 228            | (((p)[2] & 0x3F) << 12)
 229            | (((p)[3] & 0x3F) << 6)
 230            | ((p)[4] & 0x3F));
 231       p += 5;
 232     }
 233
 234   MAYBE_UNIFY_CHAR (c);
 235
 236   if (len)
 237     *len = p - saved_p;
 238   if (advanced)
 239     *advanced = p;
 240   return c;
 241 }
 242
 243
 244 /* Translate character C by translation table TABLE.  If C is
 245    negative, translate a character specified by CHARSET and CODE.  If
 246    no translation is found in TABLE, return the untranslated
 247    character.  If TABLE is a list, elements are char tables.  In this
 248    case, translace C by all tables.  */
 249
 250 int
 251 translate_char (table, c)
 252      Lisp_Object table;
 253      int c;
 254 {
 255   if (CHAR_TABLE_P (table))
 256     {
 257       Lisp_Object ch;
 258
 259       ch = CHAR_TABLE_REF (table, c);
 260       if (CHARACTERP (ch))
 261         c = XINT (ch);
 262     }
 263   else
 264     {
 265       for (; CONSP (table); table = XCDR (table))
 266         c = translate_char (XCAR (table), c);
 267     }
 268   return c;
 269 }
 270
 271 /* Convert the multibyte character C to unibyte 8-bit character based
 272    on the current value of charset_unibyte.  If dimension of
 273    charset_unibyte is more than one, return (C & 0xFF).
 274
 275    The argument REV_TBL is now ignored.  It will be removed in the
 276    future.  */
 277
 278 int
 279 multibyte_char_to_unibyte (c, rev_tbl)
 280      int c;
 281      Lisp_Object rev_tbl;
 282 {
 283   struct charset *charset;
 284   unsigned c1;
 285
 286   if (CHAR_BYTE8_P (c))
 287     return CHAR_TO_BYTE8 (c);
 288   charset = CHARSET_FROM_ID (charset_unibyte);
 289   c1 = ENCODE_CHAR (charset, c);
 290   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
 291 }
 292
 293 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 294    by charset_unibyte.  */
 295
 296 int
 297 multibyte_char_to_unibyte_safe (c)
 298      int c;
 299 {
 300   struct charset *charset;
 301   unsigned c1;
 302
 303   if (CHAR_BYTE8_P (c))
 304     return CHAR_TO_BYTE8 (c);
 305   charset = CHARSET_FROM_ID (charset_unibyte);
 306   c1 = ENCODE_CHAR (charset, c);
 307   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
 308 }
 309
 310 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 311        doc: /* Return non-nil if OBJECT is a character.  */)
 312      (object, ignore)
 313      Lisp_Object object, ignore;
 314 {
 315   return (CHARACTERP (object) ? Qt : Qnil);
 316 }
 317
 318 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 319        doc: /* Return the character of the maximum code.  */)
 320      ()
 321 {
 322   return make_number (MAX_CHAR);
 323 }
 324
 325 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 326        Sunibyte_char_to_multibyte, 1, 1, 0,
 327        doc: /* Convert the byte CH to multibyte character.  */)
 328      (ch)
 329      Lisp_Object ch;
 330 {
 331   int c;
 332   struct charset *charset;
 333
 334   CHECK_CHARACTER (ch);
 335   c = XFASTINT (ch);
 336   if (c >= 0400)
 337     error ("Invalid unibyte character: %d", c);
 338   charset = CHARSET_FROM_ID (charset_unibyte);
 339   c = DECODE_CHAR (charset, c);
 340   if (c < 0)
 341     c = BYTE8_TO_CHAR (XFASTINT (ch));
 342   return make_number (c);
 343 }
 344
 345 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 346        Smultibyte_char_to_unibyte, 1, 1, 0,
 347        doc: /* Convert the multibyte character CH to a byte.
 348 If the multibyte character does not represent a byte, return -1.  */)
 349      (ch)
 350      Lisp_Object ch;
 351 {
 352   int cm;
 353
 354   CHECK_CHARACTER (ch);
 355   cm = XFASTINT (ch);
 356   if (cm < 256)
 357     /* Can't distinguish a byte read from a unibyte buffer from
 358        a latin1 char, so let's let it slide.  */
 359     return ch;
 360   else
 361     {
 362       int cu = CHAR_TO_BYTE_SAFE (cm);
 363       return make_number (cu);
 364     }
 365 }
 366
 367 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 368        doc: /* Return 1 regardless of the argument CHAR.
 369 This is now an obsolete function.  We keep it just for backward compatibility.
 370 usage: (char-bytes CHAR)  */)
 371      (ch)
 372      Lisp_Object ch;
 373 {
 374   CHECK_CHARACTER (ch);
 375   return make_number (1);
 376 }
 377
 378 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 379        doc: /* Return width of CHAR when displayed in the current buffer.
 380 The width is measured by how many columns it occupies on the screen.
 381 Tab is taken to occupy `tab-width' columns.
 382 usage: (char-width CHAR)  */)
 383      (ch)
 384        Lisp_Object ch;
 385 {
 386   Lisp_Object disp;
 387   int c, width;
 388   struct Lisp_Char_Table *dp = buffer_display_table ();
 389
 390   CHECK_CHARACTER (ch);
 391   c = XINT (ch);
 392
 393   /* Get the way the display table would display it.  */
 394   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 395
 396   if (VECTORP (disp))
 397     width = ASIZE (disp);
 398   else
 399     width = CHAR_WIDTH (c);
 400
 401   return make_number (width);
 402 }
 403
 404 /* Return width of string STR of length LEN when displayed in the
 405    current buffer.  The width is measured by how many columns it
 406    occupies on the screen.  If PRECISION > 0, return the width of
 407    longest substring that doesn't exceed PRECISION, and set number of
 408    characters and bytes of the substring in *NCHARS and *NBYTES
 409    respectively.  */
 410
 411 int
 412 c_string_width (str, len, precision, nchars, nbytes)
 413      const unsigned char *str;
 414      int precision, *nchars, *nbytes;
 415 {
 416   int i = 0, i_byte = 0;
 417   int width = 0;
 418   struct Lisp_Char_Table *dp = buffer_display_table ();
 419
 420   while (i_byte < len)
 421     {
 422       int bytes, thiswidth;
 423       Lisp_Object val;
 424       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 425
 426       if (dp)
 427         {
 428           val = DISP_CHAR_VECTOR (dp, c);
 429           if (VECTORP (val))
 430             thiswidth = XVECTOR (val)->size;
 431           else
 432             thiswidth = CHAR_WIDTH (c);
 433         }
 434       else
 435         {
 436           thiswidth = CHAR_WIDTH (c);
 437         }
 438
 439       if (precision > 0
 440           && (width + thiswidth > precision))
 441         {
 442           *nchars = i;
 443           *nbytes = i_byte;
 444           return width;
 445         }
 446       i++;
 447       i_byte += bytes;
 448       width += thiswidth;
 449   }
 450
 451   if (precision > 0)
 452     {
 453       *nchars = i;
 454       *nbytes = i_byte;
 455     }
 456
 457   return width;
 458 }
 459
 460 /* Return width of string STR of length LEN when displayed in the
 461    current buffer.  The width is measured by how many columns it
 462    occupies on the screen.  */
 463
 464 int
 465 strwidth (str, len)
 466      unsigned char *str;
 467      int len;
 468 {
 469   return c_string_width (str, len, -1, NULL, NULL);
 470 }
 471
 472 /* Return width of Lisp string STRING when displayed in the current
 473    buffer.  The width is measured by how many columns it occupies on
 474    the screen while paying attention to compositions.  If PRECISION >
 475    0, return the width of longest substring that doesn't exceed
 476    PRECISION, and set number of characters and bytes of the substring
 477    in *NCHARS and *NBYTES respectively.  */
 478
 479 int
 480 lisp_string_width (string, precision, nchars, nbytes)
 481      Lisp_Object string;
 482      int precision, *nchars, *nbytes;
 483 {
 484   int len = SCHARS (string);
 485   /* This set multibyte to 0 even if STRING is multibyte when it
 486      contains only ascii and eight-bit-graphic, but that's
 487      intentional.  */
 488   int multibyte = len < SBYTES (string);
 489   unsigned char *str = SDATA (string);
 490   int i = 0, i_byte = 0;
 491   int width = 0;
 492   struct Lisp_Char_Table *dp = buffer_display_table ();
 493
 494   while (i < len)
 495     {
 496       int chars, bytes, thiswidth;
 497       Lisp_Object val;
 498       int cmp_id;
 499       EMACS_INT ignore, end;
 500
 501       if (find_composition (i, -1, &ignore, &end, &val, string)
 502           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 503               >= 0))
 504         {
 505           thiswidth = composition_table[cmp_id]->width;
 506           chars = end - i;
 507           bytes = string_char_to_byte (string, end) - i_byte;
 508         }
 509       else
 510         {
 511           int c;
 512
 513           if (multibyte)
 514             c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 515           else
 516             c = str[i_byte], bytes = 1;
 517           chars = 1;
 518           if (dp)
 519             {
 520               val = DISP_CHAR_VECTOR (dp, c);
 521               if (VECTORP (val))
 522                 thiswidth = XVECTOR (val)->size;
 523               else
 524                 thiswidth = CHAR_WIDTH (c);
 525             }
 526           else
 527             {
 528               thiswidth = CHAR_WIDTH (c);
 529             }
 530         }
 531
 532       if (precision > 0
 533           && (width + thiswidth > precision))
 534         {
 535           *nchars = i;
 536           *nbytes = i_byte;
 537           return width;
 538         }
 539       i += chars;
 540       i_byte += bytes;
 541       width += thiswidth;
 542   }
 543
 544   if (precision > 0)
 545     {
 546       *nchars = i;
 547       *nbytes = i_byte;
 548     }
 549
 550   return width;
 551 }
 552
 553 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 554        doc: /* Return width of STRING when displayed in the current buffer.
 555 Width is measured by how many columns it occupies on the screen.
 556 When calculating width of a multibyte character in STRING,
 557 only the base leading-code is considered; the validity of
 558 the following bytes is not checked.  Tabs in STRING are always
 559 taken to occupy `tab-width' columns.
 560 usage: (string-width STRING)  */)
 561      (str)
 562      Lisp_Object str;
 563 {
 564   Lisp_Object val;
 565
 566   CHECK_STRING (str);
 567   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 568   return val;
 569 }
 570
 571 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 572        doc: /* Return the direction of CHAR.
 573 The returned value is 0 for left-to-right and 1 for right-to-left.
 574 usage: (char-direction CHAR)  */)
 575      (ch)
 576      Lisp_Object ch;
 577 {
 578   int c;
 579
 580   CHECK_CHARACTER (ch);
 581   c = XINT (ch);
 582   return CHAR_TABLE_REF (Vchar_direction_table, c);
 583 }
 584
 585 /* Return the number of characters in the NBYTES bytes at PTR.
 586    This works by looking at the contents and checking for multibyte
 587    sequences while assuming that there's no invalid sequence.
 588    However, if the current buffer has enable-multibyte-characters =
 589    nil, we treat each byte as a character.  */
 590
 591 EMACS_INT
 592 chars_in_text (ptr, nbytes)
 593      const unsigned char *ptr;
 594      EMACS_INT nbytes;
 595 {
 596   /* current_buffer is null at early stages of Emacs initialization.  */
 597   if (current_buffer == 0
 598       || NILP (current_buffer->enable_multibyte_characters))
 599     return nbytes;
 600
 601   return multibyte_chars_in_text (ptr, nbytes);
 602 }
 603
 604 /* Return the number of characters in the NBYTES bytes at PTR.
 605    This works by looking at the contents and checking for multibyte
 606    sequences while assuming that there's no invalid sequence.  It
 607    ignores enable-multibyte-characters.  */
 608
 609 EMACS_INT
 610 multibyte_chars_in_text (ptr, nbytes)
 611      const unsigned char *ptr;
 612      EMACS_INT nbytes;
 613 {
 614   const unsigned char *endp = ptr + nbytes;
 615   int chars = 0;
 616
 617   while (ptr < endp)
 618     {
 619       int len = MULTIBYTE_LENGTH (ptr, endp);
 620
 621       if (len == 0)
 622         abort ();
 623       ptr += len;
 624       chars++;
 625     }
 626
 627   return chars;
 628 }
 629
 630 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 631    characters and bytes in it, and store them in *NCHARS and *NBYTES
 632    respectively.  On counting bytes, pay attention to that 8-bit
 633    characters not constructing a valid multibyte sequence are
 634    represented by 2-byte in a multibyte text.  */
 635
 636 void
 637 parse_str_as_multibyte (str, len, nchars, nbytes)
 638      const unsigned char *str;
 639      int len, *nchars, *nbytes;
 640 {
 641   const unsigned char *endp = str + len;
 642   int n, chars = 0, bytes = 0;
 643
 644   if (len >= MAX_MULTIBYTE_LENGTH)
 645     {
 646       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 647       while (str < adjusted_endp)
 648         {
 649           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 650             str += n, bytes += n;
 651           else
 652             str++, bytes += 2;
 653           chars++;
 654         }
 655     }
 656   while (str < endp)
 657     {
 658       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 659         str += n, bytes += n;
 660       else
 661         str++, bytes += 2;
 662       chars++;
 663     }
 664
 665   *nchars = chars;
 666   *nbytes = bytes;
 667   return;
 668 }
 669
 670 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 671    It actually converts only such 8-bit characters that don't contruct
 672    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 673    NCHARS is nonzero, set *NCHARS to the number of characters in the
 674    text.  It is assured that we can use LEN bytes at STR as a work
 675    area and that is enough.  Return the number of bytes of the
 676    resulting text.  */
 677
 678 int
 679 str_as_multibyte (str, len, nbytes, nchars)
 680      unsigned char *str;
 681      int len, nbytes, *nchars;
 682 {
 683   unsigned char *p = str, *endp = str + nbytes;
 684   unsigned char *to;
 685   int chars = 0;
 686   int n;
 687
 688   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 689     {
 690       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 691       while (p < adjusted_endp
 692              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 693         p += n, chars++;
 694     }
 695   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 696     p += n, chars++;
 697   if (nchars)
 698     *nchars = chars;
 699   if (p == endp)
 700     return nbytes;
 701
 702   to = p;
 703   nbytes = endp - p;
 704   endp = str + len;
 705   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 706   p = endp - nbytes;
 707
 708   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 709     {
 710       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 711       while (p < adjusted_endp)
 712         {
 713           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 714             {
 715               while (n--)
 716                 *to++ = *p++;
 717             }
 718           else
 719             {
 720               int c = *p++;
 721               c = BYTE8_TO_CHAR (c);
 722               to += CHAR_STRING (c, to);
 723             }
 724         }
 725       chars++;
 726     }
 727   while (p < endp)
 728     {
 729       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 730         {
 731           while (n--)
 732             *to++ = *p++;
 733         }
 734       else
 735         {
 736           int c = *p++;
 737           c = BYTE8_TO_CHAR (c);
 738           to += CHAR_STRING (c, to);
 739         }
 740       chars++;
 741     }
 742   if (nchars)
 743     *nchars = chars;
 744   return (to - str);
 745 }
 746
 747 /* Parse unibyte string at STR of LEN bytes, and return the number of
 748    bytes it may ocupy when converted to multibyte string by
 749    `str_to_multibyte'.  */
 750
 751 int
 752 parse_str_to_multibyte (str, len)
 753      unsigned char *str;
 754      int len;
 755 {
 756   unsigned char *endp = str + len;
 757   int bytes;
 758
 759   for (bytes = 0; str < endp; str++)
 760     bytes += (*str < 0x80) ? 1 : 2;
 761   return bytes;
 762 }
 763
 764
 765 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 766    that contains the same single-byte characters.  It actually
 767    converts all 8-bit characters to multibyte forms.  It is assured
 768    that we can use LEN bytes at STR as a work area and that is
 769    enough.  */
 770
 771 int
 772 str_to_multibyte (str, len, bytes)
 773      unsigned char *str;
 774      int len, bytes;
 775 {
 776   unsigned char *p = str, *endp = str + bytes;
 777   unsigned char *to;
 778
 779   while (p < endp && *p < 0x80) p++;
 780   if (p == endp)
 781     return bytes;
 782   to = p;
 783   bytes = endp - p;
 784   endp = str + len;
 785   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 786   p = endp - bytes;
 787   while (p < endp)
 788     {
 789       int c = *p++;
 790
 791       if (c >= 0x80)
 792         c = BYTE8_TO_CHAR (c);
 793       to += CHAR_STRING (c, to);
 794     }
 795   return (to - str);
 796 }
 797
 798 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 799    actually converts characters in the range 0x80..0xFF to
 800    unibyte.  */
 801
 802 int
 803 str_as_unibyte (str, bytes)
 804      unsigned char *str;
 805      int bytes;
 806 {
 807   const unsigned char *p = str, *endp = str + bytes;
 808   unsigned char *to;
 809   int c, len;
 810
 811   while (p < endp)
 812     {
 813       c = *p;
 814       len = BYTES_BY_CHAR_HEAD (c);
 815       if (CHAR_BYTE8_HEAD_P (c))
 816         break;
 817       p += len;
 818     }
 819   to = str + (p - str);
 820   while (p < endp)
 821     {
 822       c = *p;
 823       len = BYTES_BY_CHAR_HEAD (c);
 824       if (CHAR_BYTE8_HEAD_P (c))
 825         {
 826           c = STRING_CHAR_ADVANCE (p);
 827           *to++ = CHAR_TO_BYTE8 (c);
 828         }
 829       else
 830         {
 831           while (len--) *to++ = *p++;
 832         }
 833     }
 834   return (to - str);
 835 }
 836
 837 /* Convert eight-bit chars in SRC (in multibyte form) to the
 838    corresponding byte and store in DST.  CHARS is the number of
 839    characters in SRC.  The value is the number of bytes stored in DST.
 840    Usually, the value is the same as CHARS, but is less than it if SRC
 841    contains a non-ASCII, non-eight-bit characater.  If ACCEPT_LATIN_1
 842    is nonzero, a Latin-1 character is accepted and converted to a byte
 843    of that character code.
 844    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 845
 846 EMACS_INT
 847 str_to_unibyte (src, dst, chars, accept_latin_1)
 848      const unsigned char *src;
 849      unsigned char *dst;
 850      EMACS_INT chars;
 851      int accept_latin_1;
 852 {
 853   EMACS_INT i;
 854
 855   for (i = 0; i < chars; i++)
 856     {
 857       int c = STRING_CHAR_ADVANCE (src);
 858
 859       if (CHAR_BYTE8_P (c))
 860         c = CHAR_TO_BYTE8 (c);
 861       else if (! ASCII_CHAR_P (c)
 862                && (! accept_latin_1 || c >= 0x100))
 863         return i;
 864       *dst++ = c;
 865     }
 866   return i;
 867 }
 868
 869
 870 int
 871 string_count_byte8 (string)
 872      Lisp_Object string;
 873 {
 874   int multibyte = STRING_MULTIBYTE (string);
 875   int nbytes = SBYTES (string);
 876   unsigned char *p = SDATA (string);
 877   unsigned char *pend = p + nbytes;
 878   int count = 0;
 879   int c, len;
 880
 881   if (multibyte)
 882     while (p < pend)
 883       {
 884         c = *p;
 885         len = BYTES_BY_CHAR_HEAD (c);
 886
 887         if (CHAR_BYTE8_HEAD_P (c))
 888           count++;
 889         p += len;
 890       }
 891   else
 892     while (p < pend)
 893       {
 894         if (*p++ >= 0x80)
 895           count++;
 896       }
 897   return count;
 898 }
 899
 900
 901 Lisp_Object
 902 string_escape_byte8 (string)
 903      Lisp_Object string;
 904 {
 905   int nchars = SCHARS (string);
 906   int nbytes = SBYTES (string);
 907   int multibyte = STRING_MULTIBYTE (string);
 908   int byte8_count;
 909   const unsigned char *src, *src_end;
 910   unsigned char *dst;
 911   Lisp_Object val;
 912   int c, len;
 913
 914   if (multibyte && nchars == nbytes)
 915     return string;
 916
 917   byte8_count = string_count_byte8 (string);
 918
 919   if (byte8_count == 0)
 920     return string;
 921
 922   if (multibyte)
 923     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 924     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 925                                         nbytes + byte8_count * 2);
 926   else
 927     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 928     val = make_uninit_string (nbytes + byte8_count * 3);
 929
 930   src = SDATA (string);
 931   src_end = src + nbytes;
 932   dst = SDATA (val);
 933   if (multibyte)
 934     while (src < src_end)
 935       {
 936         c = *src;
 937         len = BYTES_BY_CHAR_HEAD (c);
 938
 939         if (CHAR_BYTE8_HEAD_P (c))
 940           {
 941             c = STRING_CHAR_ADVANCE (src);
 942             c = CHAR_TO_BYTE8 (c);
 943             sprintf ((char *) dst, "\\%03o", c);
 944             dst += 4;
 945           }
 946         else
 947           while (len--) *dst++ = *src++;
 948       }
 949   else
 950     while (src < src_end)
 951       {
 952         c = *src++;
 953         if (c >= 0x80)
 954           {
 955             sprintf ((char *) dst, "\\%03o", c);
 956             dst += 4;
 957           }
 958         else
 959           *dst++ = c;
 960       }
 961   return val;
 962 }
 963
 964 \f
 965 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 966        doc: /*
 967 Concatenate all the argument characters and make the result a string.
 968 usage: (string &rest CHARACTERS)  */)
 969      (n, args)
 970      int n;
 971      Lisp_Object *args;
 972 {
 973   int i;
 974   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 975   unsigned char *p = buf;
 976   int c;
 977
 978   for (i = 0; i < n; i++)
 979     {
 980       CHECK_CHARACTER (args[i]);
 981       c = XINT (args[i]);
 982       p += CHAR_STRING (c, p);
 983     }
 984
 985   return make_string_from_bytes ((char *) buf, n, p - buf);
 986 }
 987
 988 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 989        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 990 usage: (unibyte-string &rest BYTES)  */)
 991      (n, args)
 992      int n;
 993      Lisp_Object *args;
 994 {
 995   int i;
 996   unsigned char *buf = (unsigned char *) alloca (n);
 997   unsigned char *p = buf;
 998   unsigned c;
 999
1000   for (i = 0; i < n; i++)
1001     {
1002       CHECK_NATNUM (args[i]);
1003       c = XFASTINT (args[i]);
1004       if (c >= 256)
1005         args_out_of_range_3 (args[i], make_number (0), make_number (255));
1006       *p++ = c;
1007     }
1008
1009   return make_string_from_bytes ((char *) buf, n, p - buf);
1010 }
1011
1012 DEFUN ("char-resolve-modifers", Fchar_resolve_modifiers,
1013        Schar_resolve_modifiers, 1, 1, 0,
1014        doc: /* Resolve modifiers in the character CHAR.
1015 The value is a character with modifiers resolved into the character
1016 code.  Unresolved modifiers are kept in the value.
1017 usage: (char-resolve-modifers CHAR)  */)
1018      (character)
1019      Lisp_Object character;
1020 {
1021   int c;
1022
1023   CHECK_NUMBER (character);
1024   c = XINT (character);
1025   return make_number (char_resolve_modifier_mask (c));
1026 }
1027
/* One-time initialization hook for this file.  It currently does
   nothing.  */
void
init_character_once ()
{
}
1032
1033 #ifdef emacs
1034
/* Set up the symbols, subrs and Lisp variables defined in this file,
   and give the variables their initial values.  */
void
syms_of_character ()
{
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table has no DEFVAR_LISP here, so it is protected
     explicitly with staticpro.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Register the primitives defined above.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_bytes);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Schar_direction);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);

  DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Start with 16 empty slots.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  /* Initially only space and newline are marked.  */
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", &Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* Default width is 1; codes 0x80..0x9F and the codes above
     MAX_5_BYTE_CHAR get width 4.  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
               doc: /* A char-table for direction of each character.  */);
  /* Every entry starts out as 1.  */
  Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));

  DEFVAR_LISP ("printable-chars", &Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* Mark codes 32..126 and 160..MAX_5_BYTE_CHAR as printable.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", &Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* Declare the single extra slot before the table is created.  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.  */);
  Vscript_representative_chars = Qnil;
}
1108
1109 #endif /* emacs */
1110
1111 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1112    (do not change this comment) */