src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   5      Free Software Foundation, Inc.
   6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include "lisp.h"
  38 #include "character.h"
  39 #include "buffer.h"
  40 #include "charset.h"
  41 #include "composite.h"
  42 #include "disptab.h"
  43
  44 #else  /* not emacs */
  45
  46 #include "mulelib.h"
  47
  48 #endif /* emacs */
  49
/* The symbol `characterp' (set up in syms_of_character).  */
Lisp_Object Qcharacterp;

/* Vector of all translation tables ever defined.
   The ID of a translation table is an index into this vector.  */
Lisp_Object Vtranslation_table_vector;

/* A char-table for characters which may invoke auto-filling.  */
Lisp_Object Vauto_fill_chars;

/* The symbol `auto-fill-chars'; used as the purpose of
   Vauto_fill_chars when that char-table is created.  */
Lisp_Object Qauto_fill_chars;

/* Char-table of information about which character to unify to which
   Unicode character.  */
Lisp_Object Vchar_unify_table;

/* A char-table.  An element is non-nil iff the corresponding
   character has a printable glyph.  */
Lisp_Object Vprintable_chars;

/* A char-table.  An element is the column width of the corresponding
   character.  */
Lisp_Object Vchar_width_table;

/* A char-table.  An element is a symbol indicating the direction
   property of the corresponding character.  */
Lisp_Object Vchar_direction_table;

/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
unsigned char *_fetch_multibyte_char_p;

/* Char table of scripts.  */
Lisp_Object Vchar_script_table;

/* Alist of scripts vs representative characters.  */
Lisp_Object Vscript_representative_chars;

/* The symbol `char-script-table'; used as the purpose of
   Vchar_script_table when that char-table is created.  */
static Lisp_Object Qchar_script_table;

/* Mapping table from unibyte chars to multibyte chars.  */
int unibyte_to_multibyte_table[256];

/* Nth element is 1 iff unibyte char N can be mapped to a multibyte
   char.  */
char unibyte_has_multibyte_table[256];
  94
  95 \f
  96
  97 /* If character code C has modifier masks, reflect them to the
  98    character code if possible.  Return the resulting code.  */
  99
 100 int
 101 char_resolve_modifier_mask (c)
 102      int c;
 103 {
 104   /* A non-ASCII character can't reflect modifier bits to the code.  */
 105   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 106     return c;
 107
 108   /* For Meta, Shift, and Control modifiers, we need special care.  */
 109   if (c & CHAR_SHIFT)
 110     {
 111       /* Shift modifier is valid only with [A-Za-z].  */
 112       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 113         c &= ~CHAR_SHIFT;
 114       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 115         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 116       /* Shift modifier for control characters and SPC is ignored.  */
 117       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
 118         c &= ~CHAR_SHIFT;
 119     }
 120   if (c & CHAR_CTL)
 121     {
 122       /* Simulate the code in lread.c.  */
 123       /* Allow `\C- ' and `\C-?'.  */
 124       if ((c & 0377) == ' ')
 125         c &= ~0177 & ~ CHAR_CTL;
 126       else if ((c & 0377) == '?')
 127         c = 0177 | (c & ~0177 & ~CHAR_CTL);
 128       /* ASCII control chars are made from letters (both cases),
 129          as well as the non-letters within 0100...0137.  */
 130       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 131         c &= (037 | (~0177 & ~CHAR_CTL));
 132       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 133         c &= (037 | (~0177 & ~CHAR_CTL));
 134     }
 135   if (c & CHAR_META)
 136     {
 137       /* Move the meta bit to the right place for a string.  */
 138       c = (c & ~CHAR_META) | 0x80;
 139     }
 140
 141   return c;
 142 }
 143
 144
 145 /* Store multibyte form of character C at P.  If C has modifier bits,
 146    handle them appropriately.  */
 147
 148 int
 149 char_string (c, p)
 150      unsigned c;
 151      unsigned char *p;
 152 {
 153   int bytes;
 154
 155   if (c & CHAR_MODIFIER_MASK)
 156     {
 157       c = (unsigned) char_resolve_modifier_mask ((int) c);
 158       /* If C still has any modifier bits, just ignore it.  */
 159       c &= ~CHAR_MODIFIER_MASK;
 160     }
 161
 162   MAYBE_UNIFY_CHAR (c);
 163
 164   if (c <= MAX_3_BYTE_CHAR)
 165     {
 166       bytes = CHAR_STRING (c, p);
 167     }
 168   else if (c <= MAX_4_BYTE_CHAR)
 169     {
 170       p[0] = (0xF0 | (c >> 18));
 171       p[1] = (0x80 | ((c >> 12) & 0x3F));
 172       p[2] = (0x80 | ((c >> 6) & 0x3F));
 173       p[3] = (0x80 | (c & 0x3F));
 174       bytes = 4;
 175     }
 176   else if (c <= MAX_5_BYTE_CHAR)
 177     {
 178       p[0] = 0xF8;
 179       p[1] = (0x80 | ((c >> 18) & 0x0F));
 180       p[2] = (0x80 | ((c >> 12) & 0x3F));
 181       p[3] = (0x80 | ((c >> 6) & 0x3F));
 182       p[4] = (0x80 | (c & 0x3F));
 183       bytes = 5;
 184     }
 185   else if (c <= MAX_CHAR)
 186     {
 187       c = CHAR_TO_BYTE8 (c);
 188       bytes = BYTE8_STRING (c, p);
 189     }
 190   else
 191     error ("Invalid character: %d", c);
 192
 193   return bytes;
 194 }
 195
 196
 197 /* Return a character whose multibyte form is at P.  Set LEN is not
 198    NULL, it must be a pointer to integer.  In that case, set *LEN to
 199    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 200    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 201    the ending address (i.e. the starting address of the next
 202    character) of the multibyte form.  */
 203
 204 int
 205 string_char (p, advanced, len)
 206      const unsigned char *p;
 207      const unsigned char **advanced;
 208      int *len;
 209 {
 210   int c;
 211   const unsigned char *saved_p = p;
 212
 213   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 214     {
 215       c = STRING_CHAR_ADVANCE (p);
 216     }
 217   else if (! (*p & 0x08))
 218     {
 219       c = ((((p)[0] & 0xF) << 18)
 220            | (((p)[1] & 0x3F) << 12)
 221            | (((p)[2] & 0x3F) << 6)
 222            | ((p)[3] & 0x3F));
 223       p += 4;
 224     }
 225   else
 226     {
 227       c = ((((p)[1] & 0x3F) << 18)
 228            | (((p)[2] & 0x3F) << 12)
 229            | (((p)[3] & 0x3F) << 6)
 230            | ((p)[4] & 0x3F));
 231       p += 5;
 232     }
 233
 234   MAYBE_UNIFY_CHAR (c);
 235
 236   if (len)
 237     *len = p - saved_p;
 238   if (advanced)
 239     *advanced = p;
 240   return c;
 241 }
 242
 243
 244 /* Translate character C by translation table TABLE.  If C is
 245    negative, translate a character specified by CHARSET and CODE.  If
 246    no translation is found in TABLE, return the untranslated
 247    character.  If TABLE is a list, elements are char tables.  In this
 248    case, translace C by all tables.  */
 249
 250 int
 251 translate_char (table, c)
 252      Lisp_Object table;
 253      int c;
 254 {
 255   if (CHAR_TABLE_P (table))
 256     {
 257       Lisp_Object ch;
 258
 259       ch = CHAR_TABLE_REF (table, c);
 260       if (CHARACTERP (ch))
 261         c = XINT (ch);
 262     }
 263   else
 264     {
 265       for (; CONSP (table); table = XCDR (table))
 266         c = translate_char (XCAR (table), c);
 267     }
 268   return c;
 269 }
 270
 271 /* Convert the multibyte character C to unibyte 8-bit character based
 272    on the current value of charset_unibyte.  If dimension of
 273    charset_unibyte is more than one, return (C & 0xFF).
 274
 275    The argument REV_TBL is now ignored.  It will be removed in the
 276    future.  */
 277
 278 int
 279 multibyte_char_to_unibyte (c, rev_tbl)
 280      int c;
 281      Lisp_Object rev_tbl;
 282 {
 283   struct charset *charset;
 284   unsigned c1;
 285
 286   if (CHAR_BYTE8_P (c))
 287     return CHAR_TO_BYTE8 (c);
 288   charset = CHARSET_FROM_ID (charset_unibyte);
 289   c1 = ENCODE_CHAR (charset, c);
 290   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
 291 }
 292
 293 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 294    by charset_unibyte.  */
 295
 296 int
 297 multibyte_char_to_unibyte_safe (c)
 298      int c;
 299 {
 300   struct charset *charset;
 301   unsigned c1;
 302
 303   if (CHAR_BYTE8_P (c))
 304     return CHAR_TO_BYTE8 (c);
 305   charset = CHARSET_FROM_ID (charset_unibyte);
 306   c1 = ENCODE_CHAR (charset, c);
 307   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
 308 }
 309
 310 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 311        doc: /* Return non-nil if OBJECT is a character.  */)
 312      (object, ignore)
 313      Lisp_Object object, ignore;
 314 {
 315   return (CHARACTERP (object) ? Qt : Qnil);
 316 }
 317
 318 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 319        doc: /* Return the character of the maximum code.  */)
 320      ()
 321 {
 322   return make_number (MAX_CHAR);
 323 }
 324
 325 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 326        Sunibyte_char_to_multibyte, 1, 1, 0,
 327        doc: /* Convert the byte CH to multibyte character.  */)
 328      (ch)
 329      Lisp_Object ch;
 330 {
 331   int c;
 332   struct charset *charset;
 333
 334   CHECK_CHARACTER (ch);
 335   c = XFASTINT (ch);
 336   if (c >= 0400)
 337     error ("Invalid unibyte character: %d", c);
 338   charset = CHARSET_FROM_ID (charset_unibyte);
 339   c = DECODE_CHAR (charset, c);
 340   if (c < 0)
 341     c = BYTE8_TO_CHAR (XFASTINT (ch));
 342   return make_number (c);
 343 }
 344
 345 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 346        Smultibyte_char_to_unibyte, 1, 1, 0,
 347        doc: /* Convert the multibyte character CH to a byte.
 348 If the multibyte character does not represent a byte, return -1.  */)
 349      (ch)
 350      Lisp_Object ch;
 351 {
 352   int cm;
 353
 354   CHECK_CHARACTER (ch);
 355   cm = XFASTINT (ch);
 356   if (cm < 256)
 357     /* Can't distinguish a byte read from a unibyte buffer from
 358        a latin1 char, so let's let it slide.  */
 359     return ch;
 360   else
 361     {
 362       int cu = CHAR_TO_BYTE8 (cm);
 363       return make_number (cu);
 364     }
 365 }
 366
DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
       doc: /* Return 1 regardless of the argument CHAR.
This is now an obsolete function.  We keep it just for backward compatibility.
usage: (char-bytes CHAR)  */)
     (ch)
     Lisp_Object ch;
{
  /* The argument is still validated as a character even though its
     value does not influence the result.  */
  CHECK_CHARACTER (ch);
  return make_number (1);
}
 377
 378 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 379        doc: /* Return width of CHAR when displayed in the current buffer.
 380 The width is measured by how many columns it occupies on the screen.
 381 Tab is taken to occupy `tab-width' columns.
 382 usage: (char-width CHAR)  */)
 383      (ch)
 384        Lisp_Object ch;
 385 {
 386   Lisp_Object disp;
 387   int c, width;
 388   struct Lisp_Char_Table *dp = buffer_display_table ();
 389
 390   CHECK_CHARACTER (ch);
 391   c = XINT (ch);
 392
 393   /* Get the way the display table would display it.  */
 394   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 395
 396   if (VECTORP (disp))
 397     width = ASIZE (disp);
 398   else
 399     width = CHAR_WIDTH (c);
 400
 401   return make_number (width);
 402 }
 403
 404 /* Return width of string STR of length LEN when displayed in the
 405    current buffer.  The width is measured by how many columns it
 406    occupies on the screen.  If PRECISION > 0, return the width of
 407    longest substring that doesn't exceed PRECISION, and set number of
 408    characters and bytes of the substring in *NCHARS and *NBYTES
 409    respectively.  */
 410
 411 int
 412 c_string_width (str, len, precision, nchars, nbytes)
 413      const unsigned char *str;
 414      int precision, *nchars, *nbytes;
 415 {
 416   int i = 0, i_byte = 0;
 417   int width = 0;
 418   struct Lisp_Char_Table *dp = buffer_display_table ();
 419
 420   while (i_byte < len)
 421     {
 422       int bytes, thiswidth;
 423       Lisp_Object val;
 424       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 425
 426       if (dp)
 427         {
 428           val = DISP_CHAR_VECTOR (dp, c);
 429           if (VECTORP (val))
 430             thiswidth = XVECTOR (val)->size;
 431           else
 432             thiswidth = CHAR_WIDTH (c);
 433         }
 434       else
 435         {
 436           thiswidth = CHAR_WIDTH (c);
 437         }
 438
 439       if (precision > 0
 440           && (width + thiswidth > precision))
 441         {
 442           *nchars = i;
 443           *nbytes = i_byte;
 444           return width;
 445         }
 446       i++;
 447       i_byte += bytes;
 448       width += thiswidth;
 449   }
 450
 451   if (precision > 0)
 452     {
 453       *nchars = i;
 454       *nbytes = i_byte;
 455     }
 456
 457   return width;
 458 }
 459
/* Return width of string STR of length LEN when displayed in the
   current buffer.  The width is measured by how many columns it
   occupies on the screen.  This is c_string_width with no precision
   limit (PRECISION is -1, so the NULL NCHARS/NBYTES pointers are
   never written through).  */

int
strwidth (str, len)
     unsigned char *str;
     int len;
{
  return c_string_width (str, len, -1, NULL, NULL);
}
 471
/* Return width of Lisp string STRING when displayed in the current
   buffer.  The width is measured by how many columns it occupies on
   the screen while paying attention to compositions.  If PRECISION >
   0, return the width of longest substring that doesn't exceed
   PRECISION, and set number of characters and bytes of the substring
   in *NCHARS and *NBYTES respectively; in that case NCHARS and NBYTES
   must not be NULL.  If PRECISION <= 0, they are not touched.  */

int
lisp_string_width (string, precision, nchars, nbytes)
     Lisp_Object string;
     int precision, *nchars, *nbytes;
{
  /* LEN is the length of STRING in characters, not bytes.  */
  int len = SCHARS (string);
  /* This sets multibyte to 0 even if STRING is multibyte when it
     contains only ascii and eight-bit-graphic, but that's
     intentional.  */
  int multibyte = len < SBYTES (string);
  unsigned char *str = SDATA (string);
  int i = 0, i_byte = 0;
  int width = 0;
  struct Lisp_Char_Table *dp = buffer_display_table ();

  while (i < len)
    {
      int chars, bytes, thiswidth;
      Lisp_Object val;
      int cmp_id;
      EMACS_INT ignore, end;

      /* If a composition with a valid ID is found at position I, the
         whole composed run up to END is measured as one unit using
         the width recorded in composition_table.  */
      if (find_composition (i, -1, &ignore, &end, &val, string)
          && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
              >= 0))
        {
          thiswidth = composition_table[cmp_id]->width;
          chars = end - i;
          bytes = string_char_to_byte (string, end) - i_byte;
        }
      else
        {
          int c;

          /* NOTE(review): the second argument mixes a character count
             (LEN) with a byte offset (I_BYTE) -- confirm the macro
             does not rely on it as a byte length.  */
          if (multibyte)
            c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
          else
            c = str[i_byte], bytes = 1;
          chars = 1;
          if (dp)
            {
              /* A display-table vector entry counts one column per
                 element.  */
              val = DISP_CHAR_VECTOR (dp, c);
              if (VECTORP (val))
                thiswidth = XVECTOR (val)->size;
              else
                thiswidth = CHAR_WIDTH (c);
            }
          else
            {
              thiswidth = CHAR_WIDTH (c);
            }
        }

      /* Stop before the unit that would overflow PRECISION.  */
      if (precision > 0
          && (width + thiswidth > precision))
        {
          *nchars = i;
          *nbytes = i_byte;
          return width;
        }
      i += chars;
      i_byte += bytes;
      width += thiswidth;
  }

  if (precision > 0)
    {
      *nchars = i;
      *nbytes = i_byte;
    }

  return width;
}
 552
 553 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 554        doc: /* Return width of STRING when displayed in the current buffer.
 555 Width is measured by how many columns it occupies on the screen.
 556 When calculating width of a multibyte character in STRING,
 557 only the base leading-code is considered; the validity of
 558 the following bytes is not checked.  Tabs in STRING are always
 559 taken to occupy `tab-width' columns.
 560 usage: (string-width STRING)  */)
 561      (str)
 562      Lisp_Object str;
 563 {
 564   Lisp_Object val;
 565
 566   CHECK_STRING (str);
 567   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 568   return val;
 569 }
 570
 571 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 572        doc: /* Return the direction of CHAR.
 573 The returned value is 0 for left-to-right and 1 for right-to-left.
 574 usage: (char-direction CHAR)  */)
 575      (ch)
 576      Lisp_Object ch;
 577 {
 578   int c;
 579
 580   CHECK_CHARACTER (ch);
 581   c = XINT (ch);
 582   return CHAR_TABLE_REF (Vchar_direction_table, c);
 583 }
 584
 585 /* Return the number of characters in the NBYTES bytes at PTR.
 586    This works by looking at the contents and checking for multibyte
 587    sequences while assuming that there's no invalid sequence.
 588    However, if the current buffer has enable-multibyte-characters =
 589    nil, we treat each byte as a character.  */
 590
 591 EMACS_INT
 592 chars_in_text (ptr, nbytes)
 593      const unsigned char *ptr;
 594      EMACS_INT nbytes;
 595 {
 596   /* current_buffer is null at early stages of Emacs initialization.  */
 597   if (current_buffer == 0
 598       || NILP (current_buffer->enable_multibyte_characters))
 599     return nbytes;
 600
 601   return multibyte_chars_in_text (ptr, nbytes);
 602 }
 603
 604 /* Return the number of characters in the NBYTES bytes at PTR.
 605    This works by looking at the contents and checking for multibyte
 606    sequences while assuming that there's no invalid sequence.  It
 607    ignores enable-multibyte-characters.  */
 608
 609 EMACS_INT
 610 multibyte_chars_in_text (ptr, nbytes)
 611      const unsigned char *ptr;
 612      EMACS_INT nbytes;
 613 {
 614   const unsigned char *endp = ptr + nbytes;
 615   int chars = 0;
 616
 617   while (ptr < endp)
 618     {
 619       int len = MULTIBYTE_LENGTH (ptr, endp);
 620
 621       if (len == 0)
 622         abort ();
 623       ptr += len;
 624       chars++;
 625     }
 626
 627   return chars;
 628 }
 629
 630 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 631    characters and bytes in it, and store them in *NCHARS and *NBYTES
 632    respectively.  On counting bytes, pay attention to that 8-bit
 633    characters not constructing a valid multibyte sequence are
 634    represented by 2-byte in a multibyte text.  */
 635
 636 void
 637 parse_str_as_multibyte (str, len, nchars, nbytes)
 638      const unsigned char *str;
 639      int len, *nchars, *nbytes;
 640 {
 641   const unsigned char *endp = str + len;
 642   int n, chars = 0, bytes = 0;
 643
 644   if (len >= MAX_MULTIBYTE_LENGTH)
 645     {
 646       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 647       while (str < adjusted_endp)
 648         {
 649           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 650             str += n, bytes += n;
 651           else
 652             str++, bytes += 2;
 653           chars++;
 654         }
 655     }
 656   while (str < endp)
 657     {
 658       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 659         str += n, bytes += n;
 660       else
 661         str++, bytes += 2;
 662       chars++;
 663     }
 664
 665   *nchars = chars;
 666   *nbytes = bytes;
 667   return;
 668 }
 669
 670 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 671    It actually converts only such 8-bit characters that don't contruct
 672    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 673    NCHARS is nonzero, set *NCHARS to the number of characters in the
 674    text.  It is assured that we can use LEN bytes at STR as a work
 675    area and that is enough.  Return the number of bytes of the
 676    resulting text.  */
 677
 678 int
 679 str_as_multibyte (str, len, nbytes, nchars)
 680      unsigned char *str;
 681      int len, nbytes, *nchars;
 682 {
 683   unsigned char *p = str, *endp = str + nbytes;
 684   unsigned char *to;
 685   int chars = 0;
 686   int n;
 687
 688   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 689     {
 690       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 691       while (p < adjusted_endp
 692              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 693         p += n, chars++;
 694     }
 695   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 696     p += n, chars++;
 697   if (nchars)
 698     *nchars = chars;
 699   if (p == endp)
 700     return nbytes;
 701
 702   to = p;
 703   nbytes = endp - p;
 704   endp = str + len;
 705   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 706   p = endp - nbytes;
 707
 708   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 709     {
 710       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 711       while (p < adjusted_endp)
 712         {
 713           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 714             {
 715               while (n--)
 716                 *to++ = *p++;
 717             }
 718           else
 719             {
 720               int c = *p++;
 721               c = BYTE8_TO_CHAR (c);
 722               to += CHAR_STRING (c, to);
 723             }
 724         }
 725       chars++;
 726     }
 727   while (p < endp)
 728     {
 729       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 730         {
 731           while (n--)
 732             *to++ = *p++;
 733         }
 734       else
 735         {
 736           int c = *p++;
 737           c = BYTE8_TO_CHAR (c);
 738           to += CHAR_STRING (c, to);
 739         }
 740       chars++;
 741     }
 742   if (nchars)
 743     *nchars = chars;
 744   return (to - str);
 745 }
 746
 747 /* Parse unibyte string at STR of LEN bytes, and return the number of
 748    bytes it may ocupy when converted to multibyte string by
 749    `str_to_multibyte'.  */
 750
 751 int
 752 parse_str_to_multibyte (str, len)
 753      unsigned char *str;
 754      int len;
 755 {
 756   unsigned char *endp = str + len;
 757   int bytes;
 758
 759   for (bytes = 0; str < endp; str++)
 760     bytes += (*str < 0x80) ? 1 : 2;
 761   return bytes;
 762 }
 763
 764
 765 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 766    that contains the same single-byte characters.  It actually
 767    converts all 8-bit characters to multibyte forms.  It is assured
 768    that we can use LEN bytes at STR as a work area and that is
 769    enough.  */
 770
 771 int
 772 str_to_multibyte (str, len, bytes)
 773      unsigned char *str;
 774      int len, bytes;
 775 {
 776   unsigned char *p = str, *endp = str + bytes;
 777   unsigned char *to;
 778
 779   while (p < endp && *p < 0x80) p++;
 780   if (p == endp)
 781     return bytes;
 782   to = p;
 783   bytes = endp - p;
 784   endp = str + len;
 785   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 786   p = endp - bytes;
 787   while (p < endp)
 788     {
 789       int c = *p++;
 790
 791       if (c >= 0x80)
 792         c = BYTE8_TO_CHAR (c);
 793       to += CHAR_STRING (c, to);
 794     }
 795   return (to - str);
 796 }
 797
 798 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 799    actually converts characters in the range 0x80..0xFF to
 800    unibyte.  */
 801
 802 int
 803 str_as_unibyte (str, bytes)
 804      unsigned char *str;
 805      int bytes;
 806 {
 807   const unsigned char *p = str, *endp = str + bytes;
 808   unsigned char *to;
 809   int c, len;
 810
 811   while (p < endp)
 812     {
 813       c = *p;
 814       len = BYTES_BY_CHAR_HEAD (c);
 815       if (CHAR_BYTE8_HEAD_P (c))
 816         break;
 817       p += len;
 818     }
 819   to = str + (p - str);
 820   while (p < endp)
 821     {
 822       c = *p;
 823       len = BYTES_BY_CHAR_HEAD (c);
 824       if (CHAR_BYTE8_HEAD_P (c))
 825         {
 826           c = STRING_CHAR_ADVANCE (p);
 827           *to++ = CHAR_TO_BYTE8 (c);
 828         }
 829       else
 830         {
 831           while (len--) *to++ = *p++;
 832         }
 833     }
 834   return (to - str);
 835 }
 836
 837 int
 838 string_count_byte8 (string)
 839      Lisp_Object string;
 840 {
 841   int multibyte = STRING_MULTIBYTE (string);
 842   int nbytes = SBYTES (string);
 843   unsigned char *p = SDATA (string);
 844   unsigned char *pend = p + nbytes;
 845   int count = 0;
 846   int c, len;
 847
 848   if (multibyte)
 849     while (p < pend)
 850       {
 851         c = *p;
 852         len = BYTES_BY_CHAR_HEAD (c);
 853
 854         if (CHAR_BYTE8_HEAD_P (c))
 855           count++;
 856         p += len;
 857       }
 858   else
 859     while (p < pend)
 860       {
 861         if (*p++ >= 0x80)
 862           count++;
 863       }
 864   return count;
 865 }
 866
 867
/* Return a string like STRING but with every eight-bit character
   replaced by a 4-character octal escape of the form `\ooo'.  STRING
   itself is returned when it contains nothing to escape.  */

Lisp_Object
string_escape_byte8 (string)
     Lisp_Object string;
{
  int nchars = SCHARS (string);
  int nbytes = SBYTES (string);
  int multibyte = STRING_MULTIBYTE (string);
  int byte8_count;
  const unsigned char *src, *src_end;
  unsigned char *dst;
  Lisp_Object val;
  int c, len;

  /* A multibyte string with as many characters as bytes is returned
     as is.  */
  if (multibyte && nchars == nbytes)
    return string;

  byte8_count = string_count_byte8 (string);

  if (byte8_count == 0)
    return string;

  if (multibyte)
    /* Convert 2-byte sequence of byte8 chars to 4-byte octal.
       Each one therefore adds 3 characters and 2 bytes.  */
    val = make_uninit_multibyte_string (nchars + byte8_count * 3,
                                        nbytes + byte8_count * 2);
  else
    /* Convert 1-byte sequence of byte8 chars to 4-byte octal.
       Each one therefore adds 3 bytes.  */
    val = make_uninit_string (nbytes + byte8_count * 3);

  src = SDATA (string);
  src_end = src + nbytes;
  dst = SDATA (val);
  if (multibyte)
    while (src < src_end)
      {
        c = *src;
        len = BYTES_BY_CHAR_HEAD (c);

        if (CHAR_BYTE8_HEAD_P (c))
          {
            c = STRING_CHAR_ADVANCE (src);
            c = CHAR_TO_BYTE8 (c);
            /* sprintf also stores a terminating NUL after the four
               characters; DST is advanced by 4 only, so the NUL is
               overwritten by whatever is written next.  */
            sprintf ((char *) dst, "\\%03o", c);
            dst += 4;
          }
        else
          /* Any other character is copied byte for byte.  */
          while (len--) *dst++ = *src++;
      }
  else
    while (src < src_end)
      {
        c = *src++;
        if (c >= 0x80)
          {
            sprintf ((char *) dst, "\\%03o", c);
            dst += 4;
          }
        else
          *dst++ = c;
      }
  return val;
}
 930
 931 \f
 932 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 933        doc: /*
 934 Concatenate all the argument characters and make the result a string.
 935 usage: (string &rest CHARACTERS)  */)
 936      (n, args)
 937      int n;
 938      Lisp_Object *args;
 939 {
 940   int i;
 941   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 942   unsigned char *p = buf;
 943   int c;
 944
 945   for (i = 0; i < n; i++)
 946     {
 947       CHECK_CHARACTER (args[i]);
 948       c = XINT (args[i]);
 949       p += CHAR_STRING (c, p);
 950     }
 951
 952   return make_string_from_bytes ((char *) buf, n, p - buf);
 953 }
 954
 955 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 956        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 957 usage: (unibyte-string &rest BYTES)  */)
 958      (n, args)
 959      int n;
 960      Lisp_Object *args;
 961 {
 962   int i;
 963   unsigned char *buf = (unsigned char *) alloca (n);
 964   unsigned char *p = buf;
 965   unsigned c;
 966
 967   for (i = 0; i < n; i++)
 968     {
 969       CHECK_NATNUM (args[i]);
 970       c = XFASTINT (args[i]);
 971       if (c >= 256)
 972         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 973       *p++ = c;
 974     }
 975
 976   return make_string_from_bytes ((char *) buf, n, p - buf);
 977 }
 978
 979 DEFUN ("char-resolve-modifers", Fchar_resolve_modifiers,
 980        Schar_resolve_modifiers, 1, 1, 0,
 981        doc: /* Resolve modifiers in the character CHAR.
 982 The value is a character with modifiers resolved into the character
 983 code.  Unresolved modifiers are kept in the value.
 984 usage: (char-resolve-modifers CHAR)  */)
 985      (character)
 986      Lisp_Object character;
 987 {
 988   int c;
 989
 990   CHECK_NUMBER (character);
 991   c = XINT (character);
 992   return make_number (char_resolve_modifier_mask (c));
 993 }
 994
/* Initialization hook for this file.  It currently has nothing to
   do; the body is intentionally empty.  */

void
init_character_once ()
{
}
 999
1000 #ifdef emacs
1001
/* Define the symbols, Lisp primitives and Lisp variables provided by
   this file, and give the variables their initial values.  */

void
syms_of_character ()
{
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table is not a DEFVAR_LISP variable, so it is
     protected explicitly with staticpro.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Register the primitives defined with DEFUN above.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_bytes);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Schar_direction);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);

  DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Start with 16 empty slots.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  /* Initially only space and newline trigger auto-filling.  */
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", &Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* Default width is 1; the ranges 0x80..0x9F and
     MAX_5_BYTE_CHAR + 1..MAX_CHAR are given width 4.  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
               doc: /* A char-table for direction of each character.  */);
  /* Every character starts with the value 1.  */
  Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));

  DEFVAR_LISP ("printable-chars", &Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* Mark 32..126 and 160..MAX_5_BYTE_CHAR as printable; everything
     else keeps the default nil.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", &Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* Declare the one extra slot before the table is created.  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.  */);
  Vscript_representative_chars = Qnil;
}
1075
1076 #endif /* emacs */
1077
1078 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1079    (do not change this comment) */