src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
   5      Free Software Foundation, Inc.
   6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
/* Before reading this file, see the documentation in `character.h';
   it explains the concepts the code here relies on.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <setjmp.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
/* The symbol `characterp'.  */
Lisp_Object Qcharacterp;

/* Vector of translation table ever defined.
   ID of a translation table is used to index this vector.  */
Lisp_Object Vtranslation_table_vector;

/* A char-table for characters which may invoke auto-filling.  */
Lisp_Object Vauto_fill_chars;

/* The symbol `auto-fill-chars'; it is the purpose given to the
   char-table stored in Vauto_fill_chars.  */
Lisp_Object Qauto_fill_chars;

/* Char-table of information about which character to unify to which
   Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
Lisp_Object Vchar_unify_table;

/* A char-table.  An element is non-nil iff the corresponding
   character has a printable glyph.  */
Lisp_Object Vprintable_chars;

/* A char-table.  An element is the column width of the corresponding
   character.  */
Lisp_Object Vchar_width_table;

/* A char-table.  An element is a symbol indicating the direction
   property of corresponding character.  */
Lisp_Object Vchar_direction_table;

/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
unsigned char *_fetch_multibyte_char_p;

/* Char table of scripts.  */
Lisp_Object Vchar_script_table;

/* Alist of scripts vs representative characters.  */
Lisp_Object Vscript_representative_chars;

/* NOTE(review): presumably the symbol `char-script-table'; its
   initialization is not visible in this part of the file.  */
static Lisp_Object Qchar_script_table;

/* NOTE(review): presumably a char-table of Unicode general
   categories; confirm against the code that sets it up.  */
Lisp_Object Vunicode_category_table;
  90 \f
  91
  92 /* If character code C has modifier masks, reflect them to the
  93    character code if possible.  Return the resulting code.  */
  94
  95 int
  96 char_resolve_modifier_mask (c)
  97      int c;
  98 {
  99   /* A non-ASCII character can't reflect modifier bits to the code.  */
 100   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 101     return c;
 102
 103   /* For Meta, Shift, and Control modifiers, we need special care.  */
 104   if (c & CHAR_SHIFT)
 105     {
 106       /* Shift modifier is valid only with [A-Za-z].  */
 107       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 108         c &= ~CHAR_SHIFT;
 109       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 110         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 111       /* Shift modifier for control characters and SPC is ignored.  */
 112       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
 113         c &= ~CHAR_SHIFT;
 114     }
 115   if (c & CHAR_CTL)
 116     {
 117       /* Simulate the code in lread.c.  */
 118       /* Allow `\C- ' and `\C-?'.  */
 119       if ((c & 0377) == ' ')
 120         c &= ~0177 & ~ CHAR_CTL;
 121       else if ((c & 0377) == '?')
 122         c = 0177 | (c & ~0177 & ~CHAR_CTL);
 123       /* ASCII control chars are made from letters (both cases),
 124          as well as the non-letters within 0100...0137.  */
 125       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 126         c &= (037 | (~0177 & ~CHAR_CTL));
 127       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 128         c &= (037 | (~0177 & ~CHAR_CTL));
 129     }
 130   if (c & CHAR_META)
 131     {
 132       /* Move the meta bit to the right place for a string.  */
 133       c = (c & ~CHAR_META) | 0x80;
 134     }
 135
 136   return c;
 137 }
 138
 139
 140 /* Store multibyte form of character C at P.  If C has modifier bits,
 141    handle them appropriately.  */
 142
 143 int
 144 char_string (c, p)
 145      unsigned c;
 146      unsigned char *p;
 147 {
 148   int bytes;
 149
 150   if (c & CHAR_MODIFIER_MASK)
 151     {
 152       c = (unsigned) char_resolve_modifier_mask ((int) c);
 153       /* If C still has any modifier bits, just ignore it.  */
 154       c &= ~CHAR_MODIFIER_MASK;
 155     }
 156
 157   MAYBE_UNIFY_CHAR (c);
 158
 159   if (c <= MAX_3_BYTE_CHAR)
 160     {
 161       bytes = CHAR_STRING (c, p);
 162     }
 163   else if (c <= MAX_4_BYTE_CHAR)
 164     {
 165       p[0] = (0xF0 | (c >> 18));
 166       p[1] = (0x80 | ((c >> 12) & 0x3F));
 167       p[2] = (0x80 | ((c >> 6) & 0x3F));
 168       p[3] = (0x80 | (c & 0x3F));
 169       bytes = 4;
 170     }
 171   else if (c <= MAX_5_BYTE_CHAR)
 172     {
 173       p[0] = 0xF8;
 174       p[1] = (0x80 | ((c >> 18) & 0x0F));
 175       p[2] = (0x80 | ((c >> 12) & 0x3F));
 176       p[3] = (0x80 | ((c >> 6) & 0x3F));
 177       p[4] = (0x80 | (c & 0x3F));
 178       bytes = 5;
 179     }
 180   else if (c <= MAX_CHAR)
 181     {
 182       c = CHAR_TO_BYTE8 (c);
 183       bytes = BYTE8_STRING (c, p);
 184     }
 185   else
 186     error ("Invalid character: %d", c);
 187
 188   return bytes;
 189 }
 190
 191
 192 /* Return a character whose multibyte form is at P.  Set LEN is not
 193    NULL, it must be a pointer to integer.  In that case, set *LEN to
 194    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 195    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 196    the ending address (i.e. the starting address of the next
 197    character) of the multibyte form.  */
 198
 199 int
 200 string_char (p, advanced, len)
 201      const unsigned char *p;
 202      const unsigned char **advanced;
 203      int *len;
 204 {
 205   int c;
 206   const unsigned char *saved_p = p;
 207
 208   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 209     {
 210       c = STRING_CHAR_ADVANCE (p);
 211     }
 212   else if (! (*p & 0x08))
 213     {
 214       c = ((((p)[0] & 0xF) << 18)
 215            | (((p)[1] & 0x3F) << 12)
 216            | (((p)[2] & 0x3F) << 6)
 217            | ((p)[3] & 0x3F));
 218       p += 4;
 219     }
 220   else
 221     {
 222       c = ((((p)[1] & 0x3F) << 18)
 223            | (((p)[2] & 0x3F) << 12)
 224            | (((p)[3] & 0x3F) << 6)
 225            | ((p)[4] & 0x3F));
 226       p += 5;
 227     }
 228
 229   MAYBE_UNIFY_CHAR (c);
 230
 231   if (len)
 232     *len = p - saved_p;
 233   if (advanced)
 234     *advanced = p;
 235   return c;
 236 }
 237
 238
 239 /* Translate character C by translation table TABLE.  If C is
 240    negative, translate a character specified by CHARSET and CODE.  If
 241    no translation is found in TABLE, return the untranslated
 242    character.  If TABLE is a list, elements are char tables.  In this
 243    case, translace C by all tables.  */
 244
 245 int
 246 translate_char (table, c)
 247      Lisp_Object table;
 248      int c;
 249 {
 250   if (CHAR_TABLE_P (table))
 251     {
 252       Lisp_Object ch;
 253
 254       ch = CHAR_TABLE_REF (table, c);
 255       if (CHARACTERP (ch))
 256         c = XINT (ch);
 257     }
 258   else
 259     {
 260       for (; CONSP (table); table = XCDR (table))
 261         c = translate_char (XCAR (table), c);
 262     }
 263   return c;
 264 }
 265
 266 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 267    them, return (C & 0xFF).
 268
 269    The argument REV_TBL is now ignored.  It will be removed in the
 270    future.  */
 271
 272 int
 273 multibyte_char_to_unibyte (c, rev_tbl)
 274      int c;
 275      Lisp_Object rev_tbl;
 276 {
 277   if (c < 0x80)
 278     return c;
 279   if (CHAR_BYTE8_P (c))
 280     return CHAR_TO_BYTE8 (c);
 281   return (c & 0xFF);
 282 }
 283
 284 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 285    by charset_unibyte.  */
 286
 287 int
 288 multibyte_char_to_unibyte_safe (c)
 289      int c;
 290 {
 291   if (c < 0x80)
 292     return c;
 293   if (CHAR_BYTE8_P (c))
 294     return CHAR_TO_BYTE8 (c);
 295   return -1;
 296 }
 297
 298 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 299        doc: /* Return non-nil if OBJECT is a character.  */)
 300      (object, ignore)
 301      Lisp_Object object, ignore;
 302 {
 303   return (CHARACTERP (object) ? Qt : Qnil);
 304 }
 305
 306 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 307        doc: /* Return the character of the maximum code.  */)
 308      ()
 309 {
 310   return make_number (MAX_CHAR);
 311 }
 312
 313 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 314        Sunibyte_char_to_multibyte, 1, 1, 0,
 315        doc: /* Convert the byte CH to multibyte character.  */)
 316      (ch)
 317      Lisp_Object ch;
 318 {
 319   int c;
 320
 321   CHECK_CHARACTER (ch);
 322   c = XFASTINT (ch);
 323   if (c >= 0x100)
 324     error ("Not a unibyte character: %d", c);
 325   MAKE_CHAR_MULTIBYTE (c);
 326   return make_number (c);
 327 }
 328
 329 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 330        Smultibyte_char_to_unibyte, 1, 1, 0,
 331        doc: /* Convert the multibyte character CH to a byte.
 332 If the multibyte character does not represent a byte, return -1.  */)
 333      (ch)
 334      Lisp_Object ch;
 335 {
 336   int cm;
 337
 338   CHECK_CHARACTER (ch);
 339   cm = XFASTINT (ch);
 340   if (cm < 256)
 341     /* Can't distinguish a byte read from a unibyte buffer from
 342        a latin1 char, so let's let it slide.  */
 343     return ch;
 344   else
 345     {
 346       int cu = CHAR_TO_BYTE_SAFE (cm);
 347       return make_number (cu);
 348     }
 349 }
 350
 351 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 352        doc: /* Return 1 regardless of the argument CHAR.
 353 This is now an obsolete function.  We keep it just for backward compatibility.
 354 usage: (char-bytes CHAR)  */)
 355      (ch)
 356      Lisp_Object ch;
 357 {
 358   CHECK_CHARACTER (ch);
 359   return make_number (1);
 360 }
 361
 362 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 363        doc: /* Return width of CHAR when displayed in the current buffer.
 364 The width is measured by how many columns it occupies on the screen.
 365 Tab is taken to occupy `tab-width' columns.
 366 usage: (char-width CHAR)  */)
 367      (ch)
 368        Lisp_Object ch;
 369 {
 370   Lisp_Object disp;
 371   int c, width;
 372   struct Lisp_Char_Table *dp = buffer_display_table ();
 373
 374   CHECK_CHARACTER (ch);
 375   c = XINT (ch);
 376
 377   /* Get the way the display table would display it.  */
 378   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 379
 380   if (VECTORP (disp))
 381     width = ASIZE (disp);
 382   else
 383     width = CHAR_WIDTH (c);
 384
 385   return make_number (width);
 386 }
 387
 388 /* Return width of string STR of length LEN when displayed in the
 389    current buffer.  The width is measured by how many columns it
 390    occupies on the screen.  If PRECISION > 0, return the width of
 391    longest substring that doesn't exceed PRECISION, and set number of
 392    characters and bytes of the substring in *NCHARS and *NBYTES
 393    respectively.  */
 394
 395 int
 396 c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes)
 397 {
 398   int i = 0, i_byte = 0;
 399   int width = 0;
 400   struct Lisp_Char_Table *dp = buffer_display_table ();
 401
 402   while (i_byte < len)
 403     {
 404       int bytes, thiswidth;
 405       Lisp_Object val;
 406       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 407
 408       if (dp)
 409         {
 410           val = DISP_CHAR_VECTOR (dp, c);
 411           if (VECTORP (val))
 412             thiswidth = XVECTOR (val)->size;
 413           else
 414             thiswidth = CHAR_WIDTH (c);
 415         }
 416       else
 417         {
 418           thiswidth = CHAR_WIDTH (c);
 419         }
 420
 421       if (precision > 0
 422           && (width + thiswidth > precision))
 423         {
 424           *nchars = i;
 425           *nbytes = i_byte;
 426           return width;
 427         }
 428       i++;
 429       i_byte += bytes;
 430       width += thiswidth;
 431   }
 432
 433   if (precision > 0)
 434     {
 435       *nchars = i;
 436       *nbytes = i_byte;
 437     }
 438
 439   return width;
 440 }
 441
 442 /* Return width of string STR of length LEN when displayed in the
 443    current buffer.  The width is measured by how many columns it
 444    occupies on the screen.  */
 445
 446 int
 447 strwidth (str, len)
 448      unsigned char *str;
 449      int len;
 450 {
 451   return c_string_width (str, len, -1, NULL, NULL);
 452 }
 453
 454 /* Return width of Lisp string STRING when displayed in the current
 455    buffer.  The width is measured by how many columns it occupies on
 456    the screen while paying attention to compositions.  If PRECISION >
 457    0, return the width of longest substring that doesn't exceed
 458    PRECISION, and set number of characters and bytes of the substring
 459    in *NCHARS and *NBYTES respectively.  */
 460
 461 int
 462 lisp_string_width (string, precision, nchars, nbytes)
 463      Lisp_Object string;
 464      int precision, *nchars, *nbytes;
 465 {
 466   int len = SCHARS (string);
 467   /* This set multibyte to 0 even if STRING is multibyte when it
 468      contains only ascii and eight-bit-graphic, but that's
 469      intentional.  */
 470   int multibyte = len < SBYTES (string);
 471   unsigned char *str = SDATA (string);
 472   int i = 0, i_byte = 0;
 473   int width = 0;
 474   struct Lisp_Char_Table *dp = buffer_display_table ();
 475
 476   while (i < len)
 477     {
 478       int chars, bytes, thiswidth;
 479       Lisp_Object val;
 480       int cmp_id;
 481       EMACS_INT ignore, end;
 482
 483       if (find_composition (i, -1, &ignore, &end, &val, string)
 484           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 485               >= 0))
 486         {
 487           thiswidth = composition_table[cmp_id]->width;
 488           chars = end - i;
 489           bytes = string_char_to_byte (string, end) - i_byte;
 490         }
 491       else
 492         {
 493           int c;
 494
 495           if (multibyte)
 496             c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 497           else
 498             c = str[i_byte], bytes = 1;
 499           chars = 1;
 500           if (dp)
 501             {
 502               val = DISP_CHAR_VECTOR (dp, c);
 503               if (VECTORP (val))
 504                 thiswidth = XVECTOR (val)->size;
 505               else
 506                 thiswidth = CHAR_WIDTH (c);
 507             }
 508           else
 509             {
 510               thiswidth = CHAR_WIDTH (c);
 511             }
 512         }
 513
 514       if (precision > 0
 515           && (width + thiswidth > precision))
 516         {
 517           *nchars = i;
 518           *nbytes = i_byte;
 519           return width;
 520         }
 521       i += chars;
 522       i_byte += bytes;
 523       width += thiswidth;
 524   }
 525
 526   if (precision > 0)
 527     {
 528       *nchars = i;
 529       *nbytes = i_byte;
 530     }
 531
 532   return width;
 533 }
 534
 535 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 536        doc: /* Return width of STRING when displayed in the current buffer.
 537 Width is measured by how many columns it occupies on the screen.
 538 When calculating width of a multibyte character in STRING,
 539 only the base leading-code is considered; the validity of
 540 the following bytes is not checked.  Tabs in STRING are always
 541 taken to occupy `tab-width' columns.
 542 usage: (string-width STRING)  */)
 543      (str)
 544      Lisp_Object str;
 545 {
 546   Lisp_Object val;
 547
 548   CHECK_STRING (str);
 549   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 550   return val;
 551 }
 552
 553 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 554        doc: /* Return the direction of CHAR.
 555 The returned value is 0 for left-to-right and 1 for right-to-left.
 556 usage: (char-direction CHAR)  */)
 557      (ch)
 558      Lisp_Object ch;
 559 {
 560   int c;
 561
 562   CHECK_CHARACTER (ch);
 563   c = XINT (ch);
 564   return CHAR_TABLE_REF (Vchar_direction_table, c);
 565 }
 566
 567 /* Return the number of characters in the NBYTES bytes at PTR.
 568    This works by looking at the contents and checking for multibyte
 569    sequences while assuming that there's no invalid sequence.
 570    However, if the current buffer has enable-multibyte-characters =
 571    nil, we treat each byte as a character.  */
 572
 573 EMACS_INT
 574 chars_in_text (ptr, nbytes)
 575      const unsigned char *ptr;
 576      EMACS_INT nbytes;
 577 {
 578   /* current_buffer is null at early stages of Emacs initialization.  */
 579   if (current_buffer == 0
 580       || NILP (current_buffer->enable_multibyte_characters))
 581     return nbytes;
 582
 583   return multibyte_chars_in_text (ptr, nbytes);
 584 }
 585
 586 /* Return the number of characters in the NBYTES bytes at PTR.
 587    This works by looking at the contents and checking for multibyte
 588    sequences while assuming that there's no invalid sequence.  It
 589    ignores enable-multibyte-characters.  */
 590
 591 EMACS_INT
 592 multibyte_chars_in_text (ptr, nbytes)
 593      const unsigned char *ptr;
 594      EMACS_INT nbytes;
 595 {
 596   const unsigned char *endp = ptr + nbytes;
 597   int chars = 0;
 598
 599   while (ptr < endp)
 600     {
 601       int len = MULTIBYTE_LENGTH (ptr, endp);
 602
 603       if (len == 0)
 604         abort ();
 605       ptr += len;
 606       chars++;
 607     }
 608
 609   return chars;
 610 }
 611
 612 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 613    characters and bytes in it, and store them in *NCHARS and *NBYTES
 614    respectively.  On counting bytes, pay attention to that 8-bit
 615    characters not constructing a valid multibyte sequence are
 616    represented by 2-byte in a multibyte text.  */
 617
 618 void
 619 parse_str_as_multibyte (str, len, nchars, nbytes)
 620      const unsigned char *str;
 621      int len, *nchars, *nbytes;
 622 {
 623   const unsigned char *endp = str + len;
 624   int n, chars = 0, bytes = 0;
 625
 626   if (len >= MAX_MULTIBYTE_LENGTH)
 627     {
 628       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 629       while (str < adjusted_endp)
 630         {
 631           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 632             str += n, bytes += n;
 633           else
 634             str++, bytes += 2;
 635           chars++;
 636         }
 637     }
 638   while (str < endp)
 639     {
 640       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 641         str += n, bytes += n;
 642       else
 643         str++, bytes += 2;
 644       chars++;
 645     }
 646
 647   *nchars = chars;
 648   *nbytes = bytes;
 649   return;
 650 }
 651
 652 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 653    It actually converts only such 8-bit characters that don't contruct
 654    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 655    NCHARS is nonzero, set *NCHARS to the number of characters in the
 656    text.  It is assured that we can use LEN bytes at STR as a work
 657    area and that is enough.  Return the number of bytes of the
 658    resulting text.  */
 659
 660 int
 661 str_as_multibyte (str, len, nbytes, nchars)
 662      unsigned char *str;
 663      int len, nbytes, *nchars;
 664 {
 665   unsigned char *p = str, *endp = str + nbytes;
 666   unsigned char *to;
 667   int chars = 0;
 668   int n;
 669
 670   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 671     {
 672       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 673       while (p < adjusted_endp
 674              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 675         p += n, chars++;
 676     }
 677   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 678     p += n, chars++;
 679   if (nchars)
 680     *nchars = chars;
 681   if (p == endp)
 682     return nbytes;
 683
 684   to = p;
 685   nbytes = endp - p;
 686   endp = str + len;
 687   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 688   p = endp - nbytes;
 689
 690   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 691     {
 692       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 693       while (p < adjusted_endp)
 694         {
 695           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 696             {
 697               while (n--)
 698                 *to++ = *p++;
 699             }
 700           else
 701             {
 702               int c = *p++;
 703               c = BYTE8_TO_CHAR (c);
 704               to += CHAR_STRING (c, to);
 705             }
 706         }
 707       chars++;
 708     }
 709   while (p < endp)
 710     {
 711       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 712         {
 713           while (n--)
 714             *to++ = *p++;
 715         }
 716       else
 717         {
 718           int c = *p++;
 719           c = BYTE8_TO_CHAR (c);
 720           to += CHAR_STRING (c, to);
 721         }
 722       chars++;
 723     }
 724   if (nchars)
 725     *nchars = chars;
 726   return (to - str);
 727 }
 728
 729 /* Parse unibyte string at STR of LEN bytes, and return the number of
 730    bytes it may ocupy when converted to multibyte string by
 731    `str_to_multibyte'.  */
 732
 733 int
 734 parse_str_to_multibyte (str, len)
 735      unsigned char *str;
 736      int len;
 737 {
 738   unsigned char *endp = str + len;
 739   int bytes;
 740
 741   for (bytes = 0; str < endp; str++)
 742     bytes += (*str < 0x80) ? 1 : 2;
 743   return bytes;
 744 }
 745
 746
 747 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 748    that contains the same single-byte characters.  It actually
 749    converts all 8-bit characters to multibyte forms.  It is assured
 750    that we can use LEN bytes at STR as a work area and that is
 751    enough.  */
 752
 753 int
 754 str_to_multibyte (str, len, bytes)
 755      unsigned char *str;
 756      int len, bytes;
 757 {
 758   unsigned char *p = str, *endp = str + bytes;
 759   unsigned char *to;
 760
 761   while (p < endp && *p < 0x80) p++;
 762   if (p == endp)
 763     return bytes;
 764   to = p;
 765   bytes = endp - p;
 766   endp = str + len;
 767   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 768   p = endp - bytes;
 769   while (p < endp)
 770     {
 771       int c = *p++;
 772
 773       if (c >= 0x80)
 774         c = BYTE8_TO_CHAR (c);
 775       to += CHAR_STRING (c, to);
 776     }
 777   return (to - str);
 778 }
 779
 780 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 781    actually converts characters in the range 0x80..0xFF to
 782    unibyte.  */
 783
 784 int
 785 str_as_unibyte (str, bytes)
 786      unsigned char *str;
 787      int bytes;
 788 {
 789   const unsigned char *p = str, *endp = str + bytes;
 790   unsigned char *to;
 791   int c, len;
 792
 793   while (p < endp)
 794     {
 795       c = *p;
 796       len = BYTES_BY_CHAR_HEAD (c);
 797       if (CHAR_BYTE8_HEAD_P (c))
 798         break;
 799       p += len;
 800     }
 801   to = str + (p - str);
 802   while (p < endp)
 803     {
 804       c = *p;
 805       len = BYTES_BY_CHAR_HEAD (c);
 806       if (CHAR_BYTE8_HEAD_P (c))
 807         {
 808           c = STRING_CHAR_ADVANCE (p);
 809           *to++ = CHAR_TO_BYTE8 (c);
 810         }
 811       else
 812         {
 813           while (len--) *to++ = *p++;
 814         }
 815     }
 816   return (to - str);
 817 }
 818
 819 /* Convert eight-bit chars in SRC (in multibyte form) to the
 820    corresponding byte and store in DST.  CHARS is the number of
 821    characters in SRC.  The value is the number of bytes stored in DST.
 822    Usually, the value is the same as CHARS, but is less than it if SRC
 823    contains a non-ASCII, non-eight-bit characater.  If ACCEPT_LATIN_1
 824    is nonzero, a Latin-1 character is accepted and converted to a byte
 825    of that character code.
 826    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 827
 828 EMACS_INT
 829 str_to_unibyte (src, dst, chars, accept_latin_1)
 830      const unsigned char *src;
 831      unsigned char *dst;
 832      EMACS_INT chars;
 833      int accept_latin_1;
 834 {
 835   EMACS_INT i;
 836
 837   for (i = 0; i < chars; i++)
 838     {
 839       int c = STRING_CHAR_ADVANCE (src);
 840
 841       if (CHAR_BYTE8_P (c))
 842         c = CHAR_TO_BYTE8 (c);
 843       else if (! ASCII_CHAR_P (c)
 844                && (! accept_latin_1 || c >= 0x100))
 845         return i;
 846       *dst++ = c;
 847     }
 848   return i;
 849 }
 850
 851
 852 int
 853 string_count_byte8 (string)
 854      Lisp_Object string;
 855 {
 856   int multibyte = STRING_MULTIBYTE (string);
 857   int nbytes = SBYTES (string);
 858   unsigned char *p = SDATA (string);
 859   unsigned char *pend = p + nbytes;
 860   int count = 0;
 861   int c, len;
 862
 863   if (multibyte)
 864     while (p < pend)
 865       {
 866         c = *p;
 867         len = BYTES_BY_CHAR_HEAD (c);
 868
 869         if (CHAR_BYTE8_HEAD_P (c))
 870           count++;
 871         p += len;
 872       }
 873   else
 874     while (p < pend)
 875       {
 876         if (*p++ >= 0x80)
 877           count++;
 878       }
 879   return count;
 880 }
 881
 882
 883 Lisp_Object
 884 string_escape_byte8 (string)
 885      Lisp_Object string;
 886 {
 887   int nchars = SCHARS (string);
 888   int nbytes = SBYTES (string);
 889   int multibyte = STRING_MULTIBYTE (string);
 890   int byte8_count;
 891   const unsigned char *src, *src_end;
 892   unsigned char *dst;
 893   Lisp_Object val;
 894   int c, len;
 895
 896   if (multibyte && nchars == nbytes)
 897     return string;
 898
 899   byte8_count = string_count_byte8 (string);
 900
 901   if (byte8_count == 0)
 902     return string;
 903
 904   if (multibyte)
 905     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 906     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 907                                         nbytes + byte8_count * 2);
 908   else
 909     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 910     val = make_uninit_string (nbytes + byte8_count * 3);
 911
 912   src = SDATA (string);
 913   src_end = src + nbytes;
 914   dst = SDATA (val);
 915   if (multibyte)
 916     while (src < src_end)
 917       {
 918         c = *src;
 919         len = BYTES_BY_CHAR_HEAD (c);
 920
 921         if (CHAR_BYTE8_HEAD_P (c))
 922           {
 923             c = STRING_CHAR_ADVANCE (src);
 924             c = CHAR_TO_BYTE8 (c);
 925             sprintf ((char *) dst, "\\%03o", c);
 926             dst += 4;
 927           }
 928         else
 929           while (len--) *dst++ = *src++;
 930       }
 931   else
 932     while (src < src_end)
 933       {
 934         c = *src++;
 935         if (c >= 0x80)
 936           {
 937             sprintf ((char *) dst, "\\%03o", c);
 938             dst += 4;
 939           }
 940         else
 941           *dst++ = c;
 942       }
 943   return val;
 944 }
 945
 946 \f
 947 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 948        doc: /*
 949 Concatenate all the argument characters and make the result a string.
 950 usage: (string &rest CHARACTERS)  */)
 951      (n, args)
 952      int n;
 953      Lisp_Object *args;
 954 {
 955   int i;
 956   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 957   unsigned char *p = buf;
 958   int c;
 959
 960   for (i = 0; i < n; i++)
 961     {
 962       CHECK_CHARACTER (args[i]);
 963       c = XINT (args[i]);
 964       p += CHAR_STRING (c, p);
 965     }
 966
 967   return make_string_from_bytes ((char *) buf, n, p - buf);
 968 }
 969
 970 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 971        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 972 usage: (unibyte-string &rest BYTES)  */)
 973      (n, args)
 974      int n;
 975      Lisp_Object *args;
 976 {
 977   int i;
 978   unsigned char *buf = (unsigned char *) alloca (n);
 979   unsigned char *p = buf;
 980   unsigned c;
 981
 982   for (i = 0; i < n; i++)
 983     {
 984       CHECK_NATNUM (args[i]);
 985       c = XFASTINT (args[i]);
 986       if (c >= 256)
 987         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 988       *p++ = c;
 989     }
 990
 991   return make_string_from_bytes ((char *) buf, n, p - buf);
 992 }
 993
 994 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 995        Schar_resolve_modifiers, 1, 1, 0,
 996        doc: /* Resolve modifiers in the character CHAR.
 997 The value is a character with modifiers resolved into the character
 998 code.  Unresolved modifiers are kept in the value.
 999 usage: (char-resolve-modifiers CHAR)  */)
1000      (character)
1001      Lisp_Object character;
1002 {
1003   int c;
1004
1005   CHECK_NUMBER (character);
1006   c = XINT (character);
1007   return make_number (char_resolve_modifier_mask (c));
1008 }
1009
1010 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
1011        doc: /* Return a byte value of a character at point.
1012 Optional 1st arg POSITION, if non-nil, is a position of a character to get
1013 a byte value.
1014 Optional 2nd arg STRING, if non-nil, is a string of which first
1015 character is a target to get a byte value.  In this case, POSITION, if
1016 non-nil, is an index of a target character in the string.
1017
1018 If the current buffer (or STRING) is multibyte, and the target
1019 character is not ASCII nor 8-bit character, an error is signalled.  */)
1020      (position, string)
1021      Lisp_Object position, string;
1022 {
1023   int c;
1024   EMACS_INT pos;
1025   unsigned char *p;
1026
1027   if (NILP (string))
1028     {
1029       if (NILP (position))
1030         {
1031           p = PT_ADDR;
1032         }
1033       else
1034         {
1035           CHECK_NUMBER_COERCE_MARKER (position);
1036           if (XINT (position) < BEGV || XINT (position) >= ZV)
1037             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1038           pos = XFASTINT (position);
1039           p = CHAR_POS_ADDR (pos);
1040         }
1041       if (NILP (current_buffer->enable_multibyte_characters))
1042         return make_number (*p);
1043     }
1044   else
1045     {
1046       CHECK_STRING (string);
1047       if (NILP (position))
1048         {
1049           p = SDATA (string);
1050         }
1051       else
1052         {
1053           CHECK_NATNUM (position);
1054           if (XINT (position) >= SCHARS (string))
1055             args_out_of_range (string, position);
1056           pos = XFASTINT (position);
1057           p = SDATA (string) + string_char_to_byte (string, pos);
1058         }
1059       if (! STRING_MULTIBYTE (string))
1060         return make_number (*p);
1061     }
1062   c = STRING_CHAR (p, 0);
1063   if (CHAR_BYTE8_P (c))
1064     c = CHAR_TO_BYTE8 (c);
1065   else if (! ASCII_CHAR_P (c))
1066     error ("Not an ASCII nor an 8-bit character: %d", c);
1067   return make_number (c);
1068 }
1069
1070
/* Initialization entry point for this file.  The body is empty, so
   calling it currently has no effect.
   NOTE(review): presumably called once during startup, judging by the
   name -- confirm against the caller.  */
void
init_character_once ()
{
}
1075
1076 #ifdef emacs
1077
/* Define the Lisp symbols, primitives and variables provided by this
   file, and give the variables their initial values.  */
void
syms_of_character ()
{
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table is not defined with DEFVAR_LISP in this
     function; it is registered with staticpro directly and starts
     out as nil.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Register the Lisp primitives of this file.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_bytes);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Schar_direction);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);
  defsubr (&Sget_byte);

  DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Start with a vector of 16 elements, all nil.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  /* Initially only space and newline are marked with t.  */
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", &Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* The table is created with 1 as its initial value; the ranges
     0x80..0x9F and MAX_5_BYTE_CHAR + 1..MAX_CHAR are then set to 4.  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
               doc: /* A char-table for direction of each character.  */);
  /* The table is created with 1 as its initial value.  */
  Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));

  DEFVAR_LISP ("printable-chars", &Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* The table is created with nil as its initial value; the ranges
     32..126 and 160..MAX_5_BYTE_CHAR are then set to t.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", &Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* Record an extra-slot count of 1 on the symbol before creating the
     table with that symbol as its purpose.
     NOTE(review): presumably Fmake_char_table reads this property to
     size the table, so the order of these two statements matters --
     confirm in chartab.c.  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.
Each element is a cons (SCRIPT . CHARS).
SCRIPT is a symbol representing a script or a subgroup of a script.
CHARS is a list or a vector of characters.
If it is a list, all characters in the list are necessary for supporting SCRIPT.
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
  Vscript_representative_chars = Qnil;

  DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
               doc: /* Char table of Unicode's "General Category".
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* The correct char-table is setup in characters.el.  */
  Vunicode_category_table = Qnil;
}
1167
1168 #endif /* emacs */
1169
1170 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1171    (do not change this comment) */