src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
   5      Free Software Foundation, Inc.
   6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include "lisp.h"
  38 #include "character.h"
  39 #include "buffer.h"
  40 #include "charset.h"
  41 #include "composite.h"
  42 #include "disptab.h"
  43
  44 #else  /* not emacs */
  45
  46 #include "mulelib.h"
  47
  48 #endif /* emacs */
  49
/* The symbol `characterp'.  */
Lisp_Object Qcharacterp;

/* Vector of all translation tables ever defined.
   The ID of a translation table is used to index this vector.  */
Lisp_Object Vtranslation_table_vector;

/* A char-table for characters which may invoke auto-filling.  */
Lisp_Object Vauto_fill_chars;

/* The symbol `auto-fill-chars'; it is passed to Fmake_char_table
   when Vauto_fill_chars is created.  */
Lisp_Object Qauto_fill_chars;

/* Char-table of information about which character to unify to which
   Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
Lisp_Object Vchar_unify_table;

/* A char-table.  An element is non-nil iff the corresponding
   character has a printable glyph.  */
Lisp_Object Vprintable_chars;

/* A char-table.  An element is the column width of the corresponding
   character.  */
Lisp_Object Vchar_width_table;

/* A char-table.  An element is a symbol indicating the direction
   property of the corresponding character.  */
Lisp_Object Vchar_direction_table;

/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
unsigned char *_fetch_multibyte_char_p;

/* Char table of scripts.  */
Lisp_Object Vchar_script_table;

/* Alist of scripts vs representative characters.  */
Lisp_Object Vscript_representative_chars;

/* NOTE(review): presumably the symbol `char-script-table'; its
   initialization is not visible in this part of the file -- confirm.  */
static Lisp_Object Qchar_script_table;

/* NOTE(review): undocumented here; presumably a char-table mapping
   characters to Unicode general categories -- confirm.  */
Lisp_Object Vunicode_category_table;
  89 \f
  90
  91 /* If character code C has modifier masks, reflect them to the
  92    character code if possible.  Return the resulting code.  */
  93
  94 int
  95 char_resolve_modifier_mask (c)
  96      int c;
  97 {
  98   /* A non-ASCII character can't reflect modifier bits to the code.  */
  99   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 100     return c;
 101
 102   /* For Meta, Shift, and Control modifiers, we need special care.  */
 103   if (c & CHAR_SHIFT)
 104     {
 105       /* Shift modifier is valid only with [A-Za-z].  */
 106       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 107         c &= ~CHAR_SHIFT;
 108       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 109         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 110       /* Shift modifier for control characters and SPC is ignored.  */
 111       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
 112         c &= ~CHAR_SHIFT;
 113     }
 114   if (c & CHAR_CTL)
 115     {
 116       /* Simulate the code in lread.c.  */
 117       /* Allow `\C- ' and `\C-?'.  */
 118       if ((c & 0377) == ' ')
 119         c &= ~0177 & ~ CHAR_CTL;
 120       else if ((c & 0377) == '?')
 121         c = 0177 | (c & ~0177 & ~CHAR_CTL);
 122       /* ASCII control chars are made from letters (both cases),
 123          as well as the non-letters within 0100...0137.  */
 124       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 125         c &= (037 | (~0177 & ~CHAR_CTL));
 126       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 127         c &= (037 | (~0177 & ~CHAR_CTL));
 128     }
 129   if (c & CHAR_META)
 130     {
 131       /* Move the meta bit to the right place for a string.  */
 132       c = (c & ~CHAR_META) | 0x80;
 133     }
 134
 135   return c;
 136 }
 137
 138
 139 /* Store multibyte form of character C at P.  If C has modifier bits,
 140    handle them appropriately.  */
 141
 142 int
 143 char_string (c, p)
 144      unsigned c;
 145      unsigned char *p;
 146 {
 147   int bytes;
 148
 149   if (c & CHAR_MODIFIER_MASK)
 150     {
 151       c = (unsigned) char_resolve_modifier_mask ((int) c);
 152       /* If C still has any modifier bits, just ignore it.  */
 153       c &= ~CHAR_MODIFIER_MASK;
 154     }
 155
 156   MAYBE_UNIFY_CHAR (c);
 157
 158   if (c <= MAX_3_BYTE_CHAR)
 159     {
 160       bytes = CHAR_STRING (c, p);
 161     }
 162   else if (c <= MAX_4_BYTE_CHAR)
 163     {
 164       p[0] = (0xF0 | (c >> 18));
 165       p[1] = (0x80 | ((c >> 12) & 0x3F));
 166       p[2] = (0x80 | ((c >> 6) & 0x3F));
 167       p[3] = (0x80 | (c & 0x3F));
 168       bytes = 4;
 169     }
 170   else if (c <= MAX_5_BYTE_CHAR)
 171     {
 172       p[0] = 0xF8;
 173       p[1] = (0x80 | ((c >> 18) & 0x0F));
 174       p[2] = (0x80 | ((c >> 12) & 0x3F));
 175       p[3] = (0x80 | ((c >> 6) & 0x3F));
 176       p[4] = (0x80 | (c & 0x3F));
 177       bytes = 5;
 178     }
 179   else if (c <= MAX_CHAR)
 180     {
 181       c = CHAR_TO_BYTE8 (c);
 182       bytes = BYTE8_STRING (c, p);
 183     }
 184   else
 185     error ("Invalid character: %d", c);
 186
 187   return bytes;
 188 }
 189
 190
 191 /* Return a character whose multibyte form is at P.  Set LEN is not
 192    NULL, it must be a pointer to integer.  In that case, set *LEN to
 193    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 194    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 195    the ending address (i.e. the starting address of the next
 196    character) of the multibyte form.  */
 197
 198 int
 199 string_char (p, advanced, len)
 200      const unsigned char *p;
 201      const unsigned char **advanced;
 202      int *len;
 203 {
 204   int c;
 205   const unsigned char *saved_p = p;
 206
 207   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 208     {
 209       c = STRING_CHAR_ADVANCE (p);
 210     }
 211   else if (! (*p & 0x08))
 212     {
 213       c = ((((p)[0] & 0xF) << 18)
 214            | (((p)[1] & 0x3F) << 12)
 215            | (((p)[2] & 0x3F) << 6)
 216            | ((p)[3] & 0x3F));
 217       p += 4;
 218     }
 219   else
 220     {
 221       c = ((((p)[1] & 0x3F) << 18)
 222            | (((p)[2] & 0x3F) << 12)
 223            | (((p)[3] & 0x3F) << 6)
 224            | ((p)[4] & 0x3F));
 225       p += 5;
 226     }
 227
 228   MAYBE_UNIFY_CHAR (c);
 229
 230   if (len)
 231     *len = p - saved_p;
 232   if (advanced)
 233     *advanced = p;
 234   return c;
 235 }
 236
 237
 238 /* Translate character C by translation table TABLE.  If C is
 239    negative, translate a character specified by CHARSET and CODE.  If
 240    no translation is found in TABLE, return the untranslated
 241    character.  If TABLE is a list, elements are char tables.  In this
 242    case, translace C by all tables.  */
 243
 244 int
 245 translate_char (table, c)
 246      Lisp_Object table;
 247      int c;
 248 {
 249   if (CHAR_TABLE_P (table))
 250     {
 251       Lisp_Object ch;
 252
 253       ch = CHAR_TABLE_REF (table, c);
 254       if (CHARACTERP (ch))
 255         c = XINT (ch);
 256     }
 257   else
 258     {
 259       for (; CONSP (table); table = XCDR (table))
 260         c = translate_char (XCAR (table), c);
 261     }
 262   return c;
 263 }
 264
 265 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 266    them, return (C & 0xFF).
 267
 268    The argument REV_TBL is now ignored.  It will be removed in the
 269    future.  */
 270
 271 int
 272 multibyte_char_to_unibyte (c, rev_tbl)
 273      int c;
 274      Lisp_Object rev_tbl;
 275 {
 276   if (c < 0x80)
 277     return c;
 278   if (CHAR_BYTE8_P (c))
 279     return CHAR_TO_BYTE8 (c);
 280   return (c & 0xFF);
 281 }
 282
 283 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 284    by charset_unibyte.  */
 285
 286 int
 287 multibyte_char_to_unibyte_safe (c)
 288      int c;
 289 {
 290   if (c < 0x80)
 291     return c;
 292   if (CHAR_BYTE8_P (c))
 293     return CHAR_TO_BYTE8 (c);
 294   return -1;
 295 }
 296
 297 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 298        doc: /* Return non-nil if OBJECT is a character.  */)
 299      (object, ignore)
 300      Lisp_Object object, ignore;
 301 {
 302   return (CHARACTERP (object) ? Qt : Qnil);
 303 }
 304
 305 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 306        doc: /* Return the character of the maximum code.  */)
 307      ()
 308 {
 309   return make_number (MAX_CHAR);
 310 }
 311
 312 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 313        Sunibyte_char_to_multibyte, 1, 1, 0,
 314        doc: /* Convert the byte CH to multibyte character.  */)
 315      (ch)
 316      Lisp_Object ch;
 317 {
 318   int c;
 319
 320   CHECK_CHARACTER (ch);
 321   c = XFASTINT (ch);
 322   if (c >= 0x100)
 323     error ("Not a unibyte character: %d", c);
 324   MAKE_CHAR_MULTIBYTE (c);
 325   return make_number (c);
 326 }
 327
 328 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 329        Smultibyte_char_to_unibyte, 1, 1, 0,
 330        doc: /* Convert the multibyte character CH to a byte.
 331 If the multibyte character does not represent a byte, return -1.  */)
 332      (ch)
 333      Lisp_Object ch;
 334 {
 335   int cm;
 336
 337   CHECK_CHARACTER (ch);
 338   cm = XFASTINT (ch);
 339   if (cm < 256)
 340     /* Can't distinguish a byte read from a unibyte buffer from
 341        a latin1 char, so let's let it slide.  */
 342     return ch;
 343   else
 344     {
 345       int cu = CHAR_TO_BYTE_SAFE (cm);
 346       return make_number (cu);
 347     }
 348 }
 349
 350 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 351        doc: /* Return 1 regardless of the argument CHAR.
 352 This is now an obsolete function.  We keep it just for backward compatibility.
 353 usage: (char-bytes CHAR)  */)
 354      (ch)
 355      Lisp_Object ch;
 356 {
 357   CHECK_CHARACTER (ch);
 358   return make_number (1);
 359 }
 360
 361 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 362        doc: /* Return width of CHAR when displayed in the current buffer.
 363 The width is measured by how many columns it occupies on the screen.
 364 Tab is taken to occupy `tab-width' columns.
 365 usage: (char-width CHAR)  */)
 366      (ch)
 367        Lisp_Object ch;
 368 {
 369   Lisp_Object disp;
 370   int c, width;
 371   struct Lisp_Char_Table *dp = buffer_display_table ();
 372
 373   CHECK_CHARACTER (ch);
 374   c = XINT (ch);
 375
 376   /* Get the way the display table would display it.  */
 377   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 378
 379   if (VECTORP (disp))
 380     width = ASIZE (disp);
 381   else
 382     width = CHAR_WIDTH (c);
 383
 384   return make_number (width);
 385 }
 386
 387 /* Return width of string STR of length LEN when displayed in the
 388    current buffer.  The width is measured by how many columns it
 389    occupies on the screen.  If PRECISION > 0, return the width of
 390    longest substring that doesn't exceed PRECISION, and set number of
 391    characters and bytes of the substring in *NCHARS and *NBYTES
 392    respectively.  */
 393
 394 int
 395 c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes)
 396 {
 397   int i = 0, i_byte = 0;
 398   int width = 0;
 399   struct Lisp_Char_Table *dp = buffer_display_table ();
 400
 401   while (i_byte < len)
 402     {
 403       int bytes, thiswidth;
 404       Lisp_Object val;
 405       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 406
 407       if (dp)
 408         {
 409           val = DISP_CHAR_VECTOR (dp, c);
 410           if (VECTORP (val))
 411             thiswidth = XVECTOR (val)->size;
 412           else
 413             thiswidth = CHAR_WIDTH (c);
 414         }
 415       else
 416         {
 417           thiswidth = CHAR_WIDTH (c);
 418         }
 419
 420       if (precision > 0
 421           && (width + thiswidth > precision))
 422         {
 423           *nchars = i;
 424           *nbytes = i_byte;
 425           return width;
 426         }
 427       i++;
 428       i_byte += bytes;
 429       width += thiswidth;
 430   }
 431
 432   if (precision > 0)
 433     {
 434       *nchars = i;
 435       *nbytes = i_byte;
 436     }
 437
 438   return width;
 439 }
 440
 441 /* Return width of string STR of length LEN when displayed in the
 442    current buffer.  The width is measured by how many columns it
 443    occupies on the screen.  */
 444
 445 int
 446 strwidth (str, len)
 447      unsigned char *str;
 448      int len;
 449 {
 450   return c_string_width (str, len, -1, NULL, NULL);
 451 }
 452
 453 /* Return width of Lisp string STRING when displayed in the current
 454    buffer.  The width is measured by how many columns it occupies on
 455    the screen while paying attention to compositions.  If PRECISION >
 456    0, return the width of longest substring that doesn't exceed
 457    PRECISION, and set number of characters and bytes of the substring
 458    in *NCHARS and *NBYTES respectively.  */
 459
 460 int
 461 lisp_string_width (string, precision, nchars, nbytes)
 462      Lisp_Object string;
 463      int precision, *nchars, *nbytes;
 464 {
 465   int len = SCHARS (string);
 466   /* This set multibyte to 0 even if STRING is multibyte when it
 467      contains only ascii and eight-bit-graphic, but that's
 468      intentional.  */
 469   int multibyte = len < SBYTES (string);
 470   unsigned char *str = SDATA (string);
 471   int i = 0, i_byte = 0;
 472   int width = 0;
 473   struct Lisp_Char_Table *dp = buffer_display_table ();
 474
 475   while (i < len)
 476     {
 477       int chars, bytes, thiswidth;
 478       Lisp_Object val;
 479       int cmp_id;
 480       EMACS_INT ignore, end;
 481
 482       if (find_composition (i, -1, &ignore, &end, &val, string)
 483           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 484               >= 0))
 485         {
 486           thiswidth = composition_table[cmp_id]->width;
 487           chars = end - i;
 488           bytes = string_char_to_byte (string, end) - i_byte;
 489         }
 490       else
 491         {
 492           int c;
 493
 494           if (multibyte)
 495             c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 496           else
 497             c = str[i_byte], bytes = 1;
 498           chars = 1;
 499           if (dp)
 500             {
 501               val = DISP_CHAR_VECTOR (dp, c);
 502               if (VECTORP (val))
 503                 thiswidth = XVECTOR (val)->size;
 504               else
 505                 thiswidth = CHAR_WIDTH (c);
 506             }
 507           else
 508             {
 509               thiswidth = CHAR_WIDTH (c);
 510             }
 511         }
 512
 513       if (precision > 0
 514           && (width + thiswidth > precision))
 515         {
 516           *nchars = i;
 517           *nbytes = i_byte;
 518           return width;
 519         }
 520       i += chars;
 521       i_byte += bytes;
 522       width += thiswidth;
 523   }
 524
 525   if (precision > 0)
 526     {
 527       *nchars = i;
 528       *nbytes = i_byte;
 529     }
 530
 531   return width;
 532 }
 533
 534 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 535        doc: /* Return width of STRING when displayed in the current buffer.
 536 Width is measured by how many columns it occupies on the screen.
 537 When calculating width of a multibyte character in STRING,
 538 only the base leading-code is considered; the validity of
 539 the following bytes is not checked.  Tabs in STRING are always
 540 taken to occupy `tab-width' columns.
 541 usage: (string-width STRING)  */)
 542      (str)
 543      Lisp_Object str;
 544 {
 545   Lisp_Object val;
 546
 547   CHECK_STRING (str);
 548   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 549   return val;
 550 }
 551
 552 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 553        doc: /* Return the direction of CHAR.
 554 The returned value is 0 for left-to-right and 1 for right-to-left.
 555 usage: (char-direction CHAR)  */)
 556      (ch)
 557      Lisp_Object ch;
 558 {
 559   int c;
 560
 561   CHECK_CHARACTER (ch);
 562   c = XINT (ch);
 563   return CHAR_TABLE_REF (Vchar_direction_table, c);
 564 }
 565
 566 /* Return the number of characters in the NBYTES bytes at PTR.
 567    This works by looking at the contents and checking for multibyte
 568    sequences while assuming that there's no invalid sequence.
 569    However, if the current buffer has enable-multibyte-characters =
 570    nil, we treat each byte as a character.  */
 571
 572 EMACS_INT
 573 chars_in_text (ptr, nbytes)
 574      const unsigned char *ptr;
 575      EMACS_INT nbytes;
 576 {
 577   /* current_buffer is null at early stages of Emacs initialization.  */
 578   if (current_buffer == 0
 579       || NILP (current_buffer->enable_multibyte_characters))
 580     return nbytes;
 581
 582   return multibyte_chars_in_text (ptr, nbytes);
 583 }
 584
 585 /* Return the number of characters in the NBYTES bytes at PTR.
 586    This works by looking at the contents and checking for multibyte
 587    sequences while assuming that there's no invalid sequence.  It
 588    ignores enable-multibyte-characters.  */
 589
 590 EMACS_INT
 591 multibyte_chars_in_text (ptr, nbytes)
 592      const unsigned char *ptr;
 593      EMACS_INT nbytes;
 594 {
 595   const unsigned char *endp = ptr + nbytes;
 596   int chars = 0;
 597
 598   while (ptr < endp)
 599     {
 600       int len = MULTIBYTE_LENGTH (ptr, endp);
 601
 602       if (len == 0)
 603         abort ();
 604       ptr += len;
 605       chars++;
 606     }
 607
 608   return chars;
 609 }
 610
 611 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 612    characters and bytes in it, and store them in *NCHARS and *NBYTES
 613    respectively.  On counting bytes, pay attention to that 8-bit
 614    characters not constructing a valid multibyte sequence are
 615    represented by 2-byte in a multibyte text.  */
 616
 617 void
 618 parse_str_as_multibyte (str, len, nchars, nbytes)
 619      const unsigned char *str;
 620      int len, *nchars, *nbytes;
 621 {
 622   const unsigned char *endp = str + len;
 623   int n, chars = 0, bytes = 0;
 624
 625   if (len >= MAX_MULTIBYTE_LENGTH)
 626     {
 627       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 628       while (str < adjusted_endp)
 629         {
 630           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 631             str += n, bytes += n;
 632           else
 633             str++, bytes += 2;
 634           chars++;
 635         }
 636     }
 637   while (str < endp)
 638     {
 639       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 640         str += n, bytes += n;
 641       else
 642         str++, bytes += 2;
 643       chars++;
 644     }
 645
 646   *nchars = chars;
 647   *nbytes = bytes;
 648   return;
 649 }
 650
 651 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 652    It actually converts only such 8-bit characters that don't contruct
 653    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 654    NCHARS is nonzero, set *NCHARS to the number of characters in the
 655    text.  It is assured that we can use LEN bytes at STR as a work
 656    area and that is enough.  Return the number of bytes of the
 657    resulting text.  */
 658
 659 int
 660 str_as_multibyte (str, len, nbytes, nchars)
 661      unsigned char *str;
 662      int len, nbytes, *nchars;
 663 {
 664   unsigned char *p = str, *endp = str + nbytes;
 665   unsigned char *to;
 666   int chars = 0;
 667   int n;
 668
 669   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 670     {
 671       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 672       while (p < adjusted_endp
 673              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 674         p += n, chars++;
 675     }
 676   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 677     p += n, chars++;
 678   if (nchars)
 679     *nchars = chars;
 680   if (p == endp)
 681     return nbytes;
 682
 683   to = p;
 684   nbytes = endp - p;
 685   endp = str + len;
 686   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 687   p = endp - nbytes;
 688
 689   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 690     {
 691       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 692       while (p < adjusted_endp)
 693         {
 694           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 695             {
 696               while (n--)
 697                 *to++ = *p++;
 698             }
 699           else
 700             {
 701               int c = *p++;
 702               c = BYTE8_TO_CHAR (c);
 703               to += CHAR_STRING (c, to);
 704             }
 705         }
 706       chars++;
 707     }
 708   while (p < endp)
 709     {
 710       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 711         {
 712           while (n--)
 713             *to++ = *p++;
 714         }
 715       else
 716         {
 717           int c = *p++;
 718           c = BYTE8_TO_CHAR (c);
 719           to += CHAR_STRING (c, to);
 720         }
 721       chars++;
 722     }
 723   if (nchars)
 724     *nchars = chars;
 725   return (to - str);
 726 }
 727
 728 /* Parse unibyte string at STR of LEN bytes, and return the number of
 729    bytes it may ocupy when converted to multibyte string by
 730    `str_to_multibyte'.  */
 731
 732 int
 733 parse_str_to_multibyte (str, len)
 734      unsigned char *str;
 735      int len;
 736 {
 737   unsigned char *endp = str + len;
 738   int bytes;
 739
 740   for (bytes = 0; str < endp; str++)
 741     bytes += (*str < 0x80) ? 1 : 2;
 742   return bytes;
 743 }
 744
 745
 746 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 747    that contains the same single-byte characters.  It actually
 748    converts all 8-bit characters to multibyte forms.  It is assured
 749    that we can use LEN bytes at STR as a work area and that is
 750    enough.  */
 751
 752 int
 753 str_to_multibyte (str, len, bytes)
 754      unsigned char *str;
 755      int len, bytes;
 756 {
 757   unsigned char *p = str, *endp = str + bytes;
 758   unsigned char *to;
 759
 760   while (p < endp && *p < 0x80) p++;
 761   if (p == endp)
 762     return bytes;
 763   to = p;
 764   bytes = endp - p;
 765   endp = str + len;
 766   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 767   p = endp - bytes;
 768   while (p < endp)
 769     {
 770       int c = *p++;
 771
 772       if (c >= 0x80)
 773         c = BYTE8_TO_CHAR (c);
 774       to += CHAR_STRING (c, to);
 775     }
 776   return (to - str);
 777 }
 778
 779 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 780    actually converts characters in the range 0x80..0xFF to
 781    unibyte.  */
 782
 783 int
 784 str_as_unibyte (str, bytes)
 785      unsigned char *str;
 786      int bytes;
 787 {
 788   const unsigned char *p = str, *endp = str + bytes;
 789   unsigned char *to;
 790   int c, len;
 791
 792   while (p < endp)
 793     {
 794       c = *p;
 795       len = BYTES_BY_CHAR_HEAD (c);
 796       if (CHAR_BYTE8_HEAD_P (c))
 797         break;
 798       p += len;
 799     }
 800   to = str + (p - str);
 801   while (p < endp)
 802     {
 803       c = *p;
 804       len = BYTES_BY_CHAR_HEAD (c);
 805       if (CHAR_BYTE8_HEAD_P (c))
 806         {
 807           c = STRING_CHAR_ADVANCE (p);
 808           *to++ = CHAR_TO_BYTE8 (c);
 809         }
 810       else
 811         {
 812           while (len--) *to++ = *p++;
 813         }
 814     }
 815   return (to - str);
 816 }
 817
 818 /* Convert eight-bit chars in SRC (in multibyte form) to the
 819    corresponding byte and store in DST.  CHARS is the number of
 820    characters in SRC.  The value is the number of bytes stored in DST.
 821    Usually, the value is the same as CHARS, but is less than it if SRC
 822    contains a non-ASCII, non-eight-bit characater.  If ACCEPT_LATIN_1
 823    is nonzero, a Latin-1 character is accepted and converted to a byte
 824    of that character code.
 825    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 826
 827 EMACS_INT
 828 str_to_unibyte (src, dst, chars, accept_latin_1)
 829      const unsigned char *src;
 830      unsigned char *dst;
 831      EMACS_INT chars;
 832      int accept_latin_1;
 833 {
 834   EMACS_INT i;
 835
 836   for (i = 0; i < chars; i++)
 837     {
 838       int c = STRING_CHAR_ADVANCE (src);
 839
 840       if (CHAR_BYTE8_P (c))
 841         c = CHAR_TO_BYTE8 (c);
 842       else if (! ASCII_CHAR_P (c)
 843                && (! accept_latin_1 || c >= 0x100))
 844         return i;
 845       *dst++ = c;
 846     }
 847   return i;
 848 }
 849
 850
 851 int
 852 string_count_byte8 (string)
 853      Lisp_Object string;
 854 {
 855   int multibyte = STRING_MULTIBYTE (string);
 856   int nbytes = SBYTES (string);
 857   unsigned char *p = SDATA (string);
 858   unsigned char *pend = p + nbytes;
 859   int count = 0;
 860   int c, len;
 861
 862   if (multibyte)
 863     while (p < pend)
 864       {
 865         c = *p;
 866         len = BYTES_BY_CHAR_HEAD (c);
 867
 868         if (CHAR_BYTE8_HEAD_P (c))
 869           count++;
 870         p += len;
 871       }
 872   else
 873     while (p < pend)
 874       {
 875         if (*p++ >= 0x80)
 876           count++;
 877       }
 878   return count;
 879 }
 880
 881
 882 Lisp_Object
 883 string_escape_byte8 (string)
 884      Lisp_Object string;
 885 {
 886   int nchars = SCHARS (string);
 887   int nbytes = SBYTES (string);
 888   int multibyte = STRING_MULTIBYTE (string);
 889   int byte8_count;
 890   const unsigned char *src, *src_end;
 891   unsigned char *dst;
 892   Lisp_Object val;
 893   int c, len;
 894
 895   if (multibyte && nchars == nbytes)
 896     return string;
 897
 898   byte8_count = string_count_byte8 (string);
 899
 900   if (byte8_count == 0)
 901     return string;
 902
 903   if (multibyte)
 904     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 905     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 906                                         nbytes + byte8_count * 2);
 907   else
 908     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 909     val = make_uninit_string (nbytes + byte8_count * 3);
 910
 911   src = SDATA (string);
 912   src_end = src + nbytes;
 913   dst = SDATA (val);
 914   if (multibyte)
 915     while (src < src_end)
 916       {
 917         c = *src;
 918         len = BYTES_BY_CHAR_HEAD (c);
 919
 920         if (CHAR_BYTE8_HEAD_P (c))
 921           {
 922             c = STRING_CHAR_ADVANCE (src);
 923             c = CHAR_TO_BYTE8 (c);
 924             sprintf ((char *) dst, "\\%03o", c);
 925             dst += 4;
 926           }
 927         else
 928           while (len--) *dst++ = *src++;
 929       }
 930   else
 931     while (src < src_end)
 932       {
 933         c = *src++;
 934         if (c >= 0x80)
 935           {
 936             sprintf ((char *) dst, "\\%03o", c);
 937             dst += 4;
 938           }
 939         else
 940           *dst++ = c;
 941       }
 942   return val;
 943 }
 944
 945 \f
 946 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 947        doc: /*
 948 Concatenate all the argument characters and make the result a string.
 949 usage: (string &rest CHARACTERS)  */)
 950      (n, args)
 951      int n;
 952      Lisp_Object *args;
 953 {
 954   int i;
 955   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 956   unsigned char *p = buf;
 957   int c;
 958
 959   for (i = 0; i < n; i++)
 960     {
 961       CHECK_CHARACTER (args[i]);
 962       c = XINT (args[i]);
 963       p += CHAR_STRING (c, p);
 964     }
 965
 966   return make_string_from_bytes ((char *) buf, n, p - buf);
 967 }
 968
 969 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 970        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 971 usage: (unibyte-string &rest BYTES)  */)
 972      (n, args)
 973      int n;
 974      Lisp_Object *args;
 975 {
 976   int i;
 977   unsigned char *buf = (unsigned char *) alloca (n);
 978   unsigned char *p = buf;
 979   unsigned c;
 980
 981   for (i = 0; i < n; i++)
 982     {
 983       CHECK_NATNUM (args[i]);
 984       c = XFASTINT (args[i]);
 985       if (c >= 256)
 986         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 987       *p++ = c;
 988     }
 989
 990   return make_string_from_bytes ((char *) buf, n, p - buf);
 991 }
 992
 993 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 994        Schar_resolve_modifiers, 1, 1, 0,
 995        doc: /* Resolve modifiers in the character CHAR.
 996 The value is a character with modifiers resolved into the character
 997 code.  Unresolved modifiers are kept in the value.
 998 usage: (char-resolve-modifiers CHAR)  */)
 999      (character)
1000      Lisp_Object character;
1001 {
1002   int c;
1003
1004   CHECK_NUMBER (character);
1005   c = XINT (character);
1006   return make_number (char_resolve_modifier_mask (c));
1007 }
1008
1009 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
1010        doc: /* Return a byte value of a character at point.
1011 Optional 1st arg POSITION, if non-nil, is a position of a character to get
1012 a byte value.
1013 Optional 2nd arg STRING, if non-nil, is a string of which first
1014 character is a target to get a byte value.  In this case, POSITION, if
1015 non-nil, is an index of a target character in the string.
1016
1017 If the current buffer (or STRING) is multibyte, and the target
1018 character is not ASCII nor 8-bit character, an error is signalled.  */)
1019      (position, string)
1020      Lisp_Object position, string;
1021 {
1022   int c;
1023   EMACS_INT pos;
1024   unsigned char *p;
1025
1026   if (NILP (string))
1027     {
1028       if (NILP (position))
1029         {
1030           p = PT_ADDR;
1031         }
1032       else
1033         {
1034           CHECK_NUMBER_COERCE_MARKER (position);
1035           if (XINT (position) < BEGV || XINT (position) >= ZV)
1036             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1037           pos = XFASTINT (position);
1038           p = CHAR_POS_ADDR (pos);
1039         }
1040       if (NILP (current_buffer->enable_multibyte_characters))
1041         return make_number (*p);
1042     }
1043   else
1044     {
1045       CHECK_STRING (string);
1046       if (NILP (position))
1047         {
1048           p = SDATA (string);
1049         }
1050       else
1051         {
1052           CHECK_NATNUM (position);
1053           if (XINT (position) >= SCHARS (string))
1054             args_out_of_range (string, position);
1055           pos = XFASTINT (position);
1056           p = SDATA (string) + string_char_to_byte (string, pos);
1057         }
1058       if (! STRING_MULTIBYTE (string))
1059         return make_number (*p);
1060     }
1061   c = STRING_CHAR (p, 0);
1062   if (CHAR_BYTE8_P (c))
1063     c = CHAR_TO_BYTE8 (c);
1064   else if (! ASCII_CHAR_P (c))
1065     error ("Not an ASCII nor an 8-bit character: %d", c);
1066   return make_number (c);
1067 }
1068
1069
/* Initialization entry point for this file.  It currently has
   nothing to set up, so the body is intentionally empty.  It is
   defined outside the `#ifdef emacs' section that follows.  */

void
init_character_once ()
{
  /* Nothing to do.  */
}
1074
1075 #ifdef emacs
1076
/* Set up the Lisp-visible symbols, primitives and variables handled
   by this file, and give the variables their initial values.  This
   function is compiled only when `emacs' is defined.  */

void
syms_of_character ()
{
  /* Symbols referred to from C through the Q... variables.  */
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table is not declared with DEFVAR_LISP below; it is
     registered with staticpro by hand and starts out as nil.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Register the subr objects of the primitives.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_bytes);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Schar_direction);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);
  defsubr (&Sget_byte);

  /* Starts as a 16-element vector filled with nil.  */
  DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  /* Only space and newline are marked with t initially.  */
  DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  /* The table is created with 1 as its initial value; the codes
     0x80..0x9F and everything above MAX_5_BYTE_CHAR are then given
     the value 4.  */
  DEFVAR_LISP ("char-width-table", &Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  /* NOTE(review): the table is created with 1 as its initial value;
     confirm that this matches the meaning `char-direction' gives to
     the values it returns.  */
  DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
               doc: /* A char-table for direction of each character.  */);
  Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));

  /* Mark the codes 32..126 and 160..MAX_5_BYTE_CHAR with t; all other
     entries keep the initial nil.  */
  DEFVAR_LISP ("printable-chars", &Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", &Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* Record the extra-slot count (1) on the `char-script-table' symbol
     before the table is created with that symbol as its purpose.  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.
Each element is a cons (SCRIPT . CHARS).
SCRIPT is a symbol representing a script or a subgroup of a script.
CHARS is a list or a vector of characters.
If it is a list, all characters in the list are necessary for supporting SCRIPT.
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
  Vscript_representative_chars = Qnil;

  DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
               doc: /* Char table of Unicode's "General Category".
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* The correct char-table is set up in characters.el.  */
  Vunicode_category_table = Qnil;
}
1166
1167 #endif /* emacs */
1168
1169 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1170    (do not change this comment) */