src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2011  Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <setjmp.h>
  38 #include <intprops.h>
  39 #include "lisp.h"
  40 #include "character.h"
  41 #include "buffer.h"
  42 #include "charset.h"
  43 #include "composite.h"
  44 #include "disptab.h"
  45
  46 #else  /* not emacs */
  47
  48 #include "mulelib.h"
  49
  50 #endif /* emacs */
  51
/* Lisp symbol `characterp'; set up by DEFSYM in syms_of_character.  */
Lisp_Object Qcharacterp;

/* Lisp symbol `auto-fill-chars'; set up by DEFSYM in
   syms_of_character and passed to Fmake_char_table when the
   `auto-fill-chars' char-table is created there.  */
static Lisp_Object Qauto_fill_chars;

/* Char-table of information about which character to unify to which
   Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.
   It is staticpro'd and set to nil in syms_of_character.  */
Lisp_Object Vchar_unify_table;

/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
unsigned char *_fetch_multibyte_char_p;

/* Lisp symbol `char-script-table'; set up by DEFSYM in
   syms_of_character, where it also receives a
   `char-table-extra-slots' property of 1.  */
static Lisp_Object Qchar_script_table;
  64
  65 \f
  66
  67 /* If character code C has modifier masks, reflect them to the
  68    character code if possible.  Return the resulting code.  */
  69
  70 int
  71 char_resolve_modifier_mask (int c)
  72 {
  73   /* A non-ASCII character can't reflect modifier bits to the code.  */
  74   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  75     return c;
  76
  77   /* For Meta, Shift, and Control modifiers, we need special care.  */
  78   if (c & CHAR_SHIFT)
  79     {
  80       /* Shift modifier is valid only with [A-Za-z].  */
  81       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  82         c &= ~CHAR_SHIFT;
  83       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  84         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  85       /* Shift modifier for control characters and SPC is ignored.  */
  86       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  87         c &= ~CHAR_SHIFT;
  88     }
  89   if (c & CHAR_CTL)
  90     {
  91       /* Simulate the code in lread.c.  */
  92       /* Allow `\C- ' and `\C-?'.  */
  93       if ((c & 0377) == ' ')
  94         c &= ~0177 & ~ CHAR_CTL;
  95       else if ((c & 0377) == '?')
  96         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  97       /* ASCII control chars are made from letters (both cases),
  98          as well as the non-letters within 0100...0137.  */
  99       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 100         c &= (037 | (~0177 & ~CHAR_CTL));
 101       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 102         c &= (037 | (~0177 & ~CHAR_CTL));
 103     }
 104 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 105   if (c & CHAR_META)
 106     {
 107       /* Move the meta bit to the right place for a string.  */
 108       c = (c & ~CHAR_META) | 0x80;
 109     }
 110 #endif
 111
 112   return c;
 113 }
 114
 115
 116 /* Store multibyte form of character C at P.  If C has modifier bits,
 117    handle them appropriately.  */
 118
 119 int
 120 char_string (unsigned int c, unsigned char *p)
 121 {
 122   int bytes;
 123
 124   if (c & CHAR_MODIFIER_MASK)
 125     {
 126       c = (unsigned) char_resolve_modifier_mask ((int) c);
 127       /* If C still has any modifier bits, just ignore it.  */
 128       c &= ~CHAR_MODIFIER_MASK;
 129     }
 130
 131   MAYBE_UNIFY_CHAR (c);
 132
 133   if (c <= MAX_3_BYTE_CHAR)
 134     {
 135       bytes = CHAR_STRING (c, p);
 136     }
 137   else if (c <= MAX_4_BYTE_CHAR)
 138     {
 139       p[0] = (0xF0 | (c >> 18));
 140       p[1] = (0x80 | ((c >> 12) & 0x3F));
 141       p[2] = (0x80 | ((c >> 6) & 0x3F));
 142       p[3] = (0x80 | (c & 0x3F));
 143       bytes = 4;
 144     }
 145   else if (c <= MAX_5_BYTE_CHAR)
 146     {
 147       p[0] = 0xF8;
 148       p[1] = (0x80 | ((c >> 18) & 0x0F));
 149       p[2] = (0x80 | ((c >> 12) & 0x3F));
 150       p[3] = (0x80 | ((c >> 6) & 0x3F));
 151       p[4] = (0x80 | (c & 0x3F));
 152       bytes = 5;
 153     }
 154   else if (c <= MAX_CHAR)
 155     {
 156       c = CHAR_TO_BYTE8 (c);
 157       bytes = BYTE8_STRING (c, p);
 158     }
 159   else
 160     error ("Invalid character: %x", c);
 161
 162   return bytes;
 163 }
 164
 165
 166 /* Return a character whose multibyte form is at P.  If LEN is not
 167    NULL, it must be a pointer to integer.  In that case, set *LEN to
 168    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 169    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 170    the ending address (i.e., the starting address of the next
 171    character) of the multibyte form.  */
 172
 173 int
 174 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 175 {
 176   int c;
 177   const unsigned char *saved_p = p;
 178
 179   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 180     {
 181       c = STRING_CHAR_ADVANCE (p);
 182     }
 183   else if (! (*p & 0x08))
 184     {
 185       c = ((((p)[0] & 0xF) << 18)
 186            | (((p)[1] & 0x3F) << 12)
 187            | (((p)[2] & 0x3F) << 6)
 188            | ((p)[3] & 0x3F));
 189       p += 4;
 190     }
 191   else
 192     {
 193       c = ((((p)[1] & 0x3F) << 18)
 194            | (((p)[2] & 0x3F) << 12)
 195            | (((p)[3] & 0x3F) << 6)
 196            | ((p)[4] & 0x3F));
 197       p += 5;
 198     }
 199
 200   MAYBE_UNIFY_CHAR (c);
 201
 202   if (len)
 203     *len = p - saved_p;
 204   if (advanced)
 205     *advanced = p;
 206   return c;
 207 }
 208
 209
 210 /* Translate character C by translation table TABLE.  If no translation is
 211    found in TABLE, return the untranslated character.  If TABLE is a list,
 212    elements are char tables.  In that case, recursively translate C by all the
 213    tables in the list.  */
 214
 215 int
 216 translate_char (Lisp_Object table, int c)
 217 {
 218   if (CHAR_TABLE_P (table))
 219     {
 220       Lisp_Object ch;
 221
 222       ch = CHAR_TABLE_REF (table, c);
 223       if (CHARACTERP (ch))
 224         c = XINT (ch);
 225     }
 226   else
 227     {
 228       for (; CONSP (table); table = XCDR (table))
 229         c = translate_char (XCAR (table), c);
 230     }
 231   return c;
 232 }
 233
 234 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 235    them, return (C & 0xFF).  */
 236
 237 int
 238 multibyte_char_to_unibyte (int c)
 239 {
 240   if (c < 0x80)
 241     return c;
 242   if (CHAR_BYTE8_P (c))
 243     return CHAR_TO_BYTE8 (c);
 244   return (c & 0xFF);
 245 }
 246
 247 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 248    by charset_unibyte.  */
 249
 250 int
 251 multibyte_char_to_unibyte_safe (int c)
 252 {
 253   if (c < 0x80)
 254     return c;
 255   if (CHAR_BYTE8_P (c))
 256     return CHAR_TO_BYTE8 (c);
 257   return -1;
 258 }
 259
 260 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 261        doc: /* Return non-nil if OBJECT is a character.  */)
 262   (Lisp_Object object, Lisp_Object ignore)
 263 {
 264   return (CHARACTERP (object) ? Qt : Qnil);
 265 }
 266
 267 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 268        doc: /* Return the character of the maximum code.  */)
 269   (void)
 270 {
 271   return make_number (MAX_CHAR);
 272 }
 273
 274 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 275        Sunibyte_char_to_multibyte, 1, 1, 0,
 276        doc: /* Convert the byte CH to multibyte character.  */)
 277   (Lisp_Object ch)
 278 {
 279   int c;
 280
 281   CHECK_CHARACTER (ch);
 282   c = XFASTINT (ch);
 283   if (c >= 0x100)
 284     error ("Not a unibyte character: %d", c);
 285   MAKE_CHAR_MULTIBYTE (c);
 286   return make_number (c);
 287 }
 288
 289 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 290        Smultibyte_char_to_unibyte, 1, 1, 0,
 291        doc: /* Convert the multibyte character CH to a byte.
 292 If the multibyte character does not represent a byte, return -1.  */)
 293   (Lisp_Object ch)
 294 {
 295   int cm;
 296
 297   CHECK_CHARACTER (ch);
 298   cm = XFASTINT (ch);
 299   if (cm < 256)
 300     /* Can't distinguish a byte read from a unibyte buffer from
 301        a latin1 char, so let's let it slide.  */
 302     return ch;
 303   else
 304     {
 305       int cu = CHAR_TO_BYTE_SAFE (cm);
 306       return make_number (cu);
 307     }
 308 }
 309
 310 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 311        doc: /* Return width of CHAR when displayed in the current buffer.
 312 The width is measured by how many columns it occupies on the screen.
 313 Tab is taken to occupy `tab-width' columns.
 314 usage: (char-width CHAR)  */)
 315   (Lisp_Object ch)
 316 {
 317   Lisp_Object disp;
 318   int c, width;
 319   struct Lisp_Char_Table *dp = buffer_display_table ();
 320
 321   CHECK_CHARACTER (ch);
 322   c = XINT (ch);
 323
 324   /* Get the way the display table would display it.  */
 325   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 326
 327   if (VECTORP (disp))
 328     width = ASIZE (disp);
 329   else
 330     width = CHAR_WIDTH (c);
 331
 332   return make_number (width);
 333 }
 334
 335 /* Return width of string STR of length LEN when displayed in the
 336    current buffer.  The width is measured by how many columns it
 337    occupies on the screen.  If PRECISION > 0, return the width of
 338    longest substring that doesn't exceed PRECISION, and set number of
 339    characters and bytes of the substring in *NCHARS and *NBYTES
 340    respectively.  */
 341
 342 EMACS_INT
 343 c_string_width (const unsigned char *str, EMACS_INT len, int precision,
 344                 EMACS_INT *nchars, EMACS_INT *nbytes)
 345 {
 346   EMACS_INT i = 0, i_byte = 0;
 347   EMACS_INT width = 0;
 348   struct Lisp_Char_Table *dp = buffer_display_table ();
 349
 350   while (i_byte < len)
 351     {
 352       int bytes, thiswidth;
 353       Lisp_Object val;
 354       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 355
 356       if (dp)
 357         {
 358           val = DISP_CHAR_VECTOR (dp, c);
 359           if (VECTORP (val))
 360             thiswidth = ASIZE (val);
 361           else
 362             thiswidth = CHAR_WIDTH (c);
 363         }
 364       else
 365         {
 366           thiswidth = CHAR_WIDTH (c);
 367         }
 368
 369       if (precision > 0
 370           && (width + thiswidth > precision))
 371         {
 372           *nchars = i;
 373           *nbytes = i_byte;
 374           return width;
 375         }
 376       i++;
 377       i_byte += bytes;
 378       width += thiswidth;
 379   }
 380
 381   if (precision > 0)
 382     {
 383       *nchars = i;
 384       *nbytes = i_byte;
 385     }
 386
 387   return width;
 388 }
 389
 390 /* Return width of string STR of length LEN when displayed in the
 391    current buffer.  The width is measured by how many columns it
 392    occupies on the screen.  */
 393
 394 EMACS_INT
 395 strwidth (const char *str, EMACS_INT len)
 396 {
 397   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 398 }
 399
 400 /* Return width of Lisp string STRING when displayed in the current
 401    buffer.  The width is measured by how many columns it occupies on
 402    the screen while paying attention to compositions.  If PRECISION >
 403    0, return the width of longest substring that doesn't exceed
 404    PRECISION, and set number of characters and bytes of the substring
 405    in *NCHARS and *NBYTES respectively.  */
 406
 407 EMACS_INT
 408 lisp_string_width (Lisp_Object string, EMACS_INT precision,
 409                    EMACS_INT *nchars, EMACS_INT *nbytes)
 410 {
 411   EMACS_INT len = SCHARS (string);
 412   /* This set multibyte to 0 even if STRING is multibyte when it
 413      contains only ascii and eight-bit-graphic, but that's
 414      intentional.  */
 415   int multibyte = len < SBYTES (string);
 416   unsigned char *str = SDATA (string);
 417   EMACS_INT i = 0, i_byte = 0;
 418   EMACS_INT width = 0;
 419   struct Lisp_Char_Table *dp = buffer_display_table ();
 420
 421   while (i < len)
 422     {
 423       EMACS_INT chars, bytes, thiswidth;
 424       Lisp_Object val;
 425       int cmp_id;
 426       EMACS_INT ignore, end;
 427
 428       if (find_composition (i, -1, &ignore, &end, &val, string)
 429           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 430               >= 0))
 431         {
 432           thiswidth = composition_table[cmp_id]->width;
 433           chars = end - i;
 434           bytes = string_char_to_byte (string, end) - i_byte;
 435         }
 436       else
 437         {
 438           int c;
 439
 440           if (multibyte)
 441             {
 442               int cbytes;
 443               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 444               bytes = cbytes;
 445             }
 446           else
 447             c = str[i_byte], bytes = 1;
 448           chars = 1;
 449           if (dp)
 450             {
 451               val = DISP_CHAR_VECTOR (dp, c);
 452               if (VECTORP (val))
 453                 thiswidth = ASIZE (val);
 454               else
 455                 thiswidth = CHAR_WIDTH (c);
 456             }
 457           else
 458             {
 459               thiswidth = CHAR_WIDTH (c);
 460             }
 461         }
 462
 463       if (precision <= 0)
 464         {
 465 #ifdef emacs
 466           if (INT_ADD_OVERFLOW (width, thiswidth))
 467             string_overflow ();
 468 #endif
 469         }
 470       else if (precision - width < thiswidth)
 471         {
 472           *nchars = i;
 473           *nbytes = i_byte;
 474           return width;
 475         }
 476       i += chars;
 477       i_byte += bytes;
 478       width += thiswidth;
 479     }
 480
 481   if (precision > 0)
 482     {
 483       *nchars = i;
 484       *nbytes = i_byte;
 485     }
 486
 487   return width;
 488 }
 489
 490 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 491        doc: /* Return width of STRING when displayed in the current buffer.
 492 Width is measured by how many columns it occupies on the screen.
 493 When calculating width of a multibyte character in STRING,
 494 only the base leading-code is considered; the validity of
 495 the following bytes is not checked.  Tabs in STRING are always
 496 taken to occupy `tab-width' columns.
 497 usage: (string-width STRING)  */)
 498   (Lisp_Object str)
 499 {
 500   Lisp_Object val;
 501
 502   CHECK_STRING (str);
 503   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 504   return val;
 505 }
 506
 507 /* Return the number of characters in the NBYTES bytes at PTR.
 508    This works by looking at the contents and checking for multibyte
 509    sequences while assuming that there's no invalid sequence.
 510    However, if the current buffer has enable-multibyte-characters =
 511    nil, we treat each byte as a character.  */
 512
 513 EMACS_INT
 514 chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
 515 {
 516   /* current_buffer is null at early stages of Emacs initialization.  */
 517   if (current_buffer == 0
 518       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 519     return nbytes;
 520
 521   return multibyte_chars_in_text (ptr, nbytes);
 522 }
 523
 524 /* Return the number of characters in the NBYTES bytes at PTR.
 525    This works by looking at the contents and checking for multibyte
 526    sequences while assuming that there's no invalid sequence.  It
 527    ignores enable-multibyte-characters.  */
 528
 529 EMACS_INT
 530 multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
 531 {
 532   const unsigned char *endp = ptr + nbytes;
 533   EMACS_INT chars = 0;
 534
 535   while (ptr < endp)
 536     {
 537       EMACS_INT len = MULTIBYTE_LENGTH (ptr, endp);
 538
 539       if (len == 0)
 540         abort ();
 541       ptr += len;
 542       chars++;
 543     }
 544
 545   return chars;
 546 }
 547
 548 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 549    characters and bytes in it, and store them in *NCHARS and *NBYTES
 550    respectively.  On counting bytes, pay attention to that 8-bit
 551    characters not constructing a valid multibyte sequence are
 552    represented by 2-byte in a multibyte text.  */
 553
 554 void
 555 parse_str_as_multibyte (const unsigned char *str, EMACS_INT len,
 556                         EMACS_INT *nchars, EMACS_INT *nbytes)
 557 {
 558   const unsigned char *endp = str + len;
 559   EMACS_INT n, chars = 0, bytes = 0;
 560
 561   if (len >= MAX_MULTIBYTE_LENGTH)
 562     {
 563       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 564       while (str < adjusted_endp)
 565         {
 566           if (! CHAR_BYTE8_HEAD_P (*str)
 567               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 568             str += n, bytes += n;
 569           else
 570             str++, bytes += 2;
 571           chars++;
 572         }
 573     }
 574   while (str < endp)
 575     {
 576       if (! CHAR_BYTE8_HEAD_P (*str)
 577           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 578         str += n, bytes += n;
 579       else
 580         str++, bytes += 2;
 581       chars++;
 582     }
 583
 584   *nchars = chars;
 585   *nbytes = bytes;
 586   return;
 587 }
 588
 589 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 590    It actually converts only such 8-bit characters that don't contruct
 591    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 592    NCHARS is nonzero, set *NCHARS to the number of characters in the
 593    text.  It is assured that we can use LEN bytes at STR as a work
 594    area and that is enough.  Return the number of bytes of the
 595    resulting text.  */
 596
 597 EMACS_INT
 598 str_as_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT nbytes,
 599                   EMACS_INT *nchars)
 600 {
 601   unsigned char *p = str, *endp = str + nbytes;
 602   unsigned char *to;
 603   EMACS_INT chars = 0;
 604   int n;
 605
 606   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 607     {
 608       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 609       while (p < adjusted_endp
 610              && ! CHAR_BYTE8_HEAD_P (*p)
 611              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 612         p += n, chars++;
 613     }
 614   while (p < endp
 615          && ! CHAR_BYTE8_HEAD_P (*p)
 616          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 617     p += n, chars++;
 618   if (nchars)
 619     *nchars = chars;
 620   if (p == endp)
 621     return nbytes;
 622
 623   to = p;
 624   nbytes = endp - p;
 625   endp = str + len;
 626   memmove (endp - nbytes, p, nbytes);
 627   p = endp - nbytes;
 628
 629   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 630     {
 631       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 632       while (p < adjusted_endp)
 633         {
 634           if (! CHAR_BYTE8_HEAD_P (*p)
 635               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 636             {
 637               while (n--)
 638                 *to++ = *p++;
 639             }
 640           else
 641             {
 642               int c = *p++;
 643               c = BYTE8_TO_CHAR (c);
 644               to += CHAR_STRING (c, to);
 645             }
 646         }
 647       chars++;
 648     }
 649   while (p < endp)
 650     {
 651       if (! CHAR_BYTE8_HEAD_P (*p)
 652           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 653         {
 654           while (n--)
 655             *to++ = *p++;
 656         }
 657       else
 658         {
 659           int c = *p++;
 660           c = BYTE8_TO_CHAR (c);
 661           to += CHAR_STRING (c, to);
 662         }
 663       chars++;
 664     }
 665   if (nchars)
 666     *nchars = chars;
 667   return (to - str);
 668 }
 669
 670 /* Parse unibyte string at STR of LEN bytes, and return the number of
 671    bytes it may ocupy when converted to multibyte string by
 672    `str_to_multibyte'.  */
 673
 674 EMACS_INT
 675 count_size_as_multibyte (const unsigned char *str, EMACS_INT len)
 676 {
 677   const unsigned char *endp = str + len;
 678   EMACS_INT bytes;
 679
 680   for (bytes = 0; str < endp; str++)
 681     {
 682       int n = *str < 0x80 ? 1 : 2;
 683       if (INT_ADD_OVERFLOW (bytes, n))
 684         string_overflow ();
 685       bytes += n;
 686     }
 687   return bytes;
 688 }
 689
 690
 691 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 692    that contains the same single-byte characters.  It actually
 693    converts all 8-bit characters to multibyte forms.  It is assured
 694    that we can use LEN bytes at STR as a work area and that is
 695    enough.  */
 696
 697 EMACS_INT
 698 str_to_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT bytes)
 699 {
 700   unsigned char *p = str, *endp = str + bytes;
 701   unsigned char *to;
 702
 703   while (p < endp && *p < 0x80) p++;
 704   if (p == endp)
 705     return bytes;
 706   to = p;
 707   bytes = endp - p;
 708   endp = str + len;
 709   memmove (endp - bytes, p, bytes);
 710   p = endp - bytes;
 711   while (p < endp)
 712     {
 713       int c = *p++;
 714
 715       if (c >= 0x80)
 716         c = BYTE8_TO_CHAR (c);
 717       to += CHAR_STRING (c, to);
 718     }
 719   return (to - str);
 720 }
 721
 722 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 723    actually converts characters in the range 0x80..0xFF to
 724    unibyte.  */
 725
 726 EMACS_INT
 727 str_as_unibyte (unsigned char *str, EMACS_INT bytes)
 728 {
 729   const unsigned char *p = str, *endp = str + bytes;
 730   unsigned char *to;
 731   int c, len;
 732
 733   while (p < endp)
 734     {
 735       c = *p;
 736       len = BYTES_BY_CHAR_HEAD (c);
 737       if (CHAR_BYTE8_HEAD_P (c))
 738         break;
 739       p += len;
 740     }
 741   to = str + (p - str);
 742   while (p < endp)
 743     {
 744       c = *p;
 745       len = BYTES_BY_CHAR_HEAD (c);
 746       if (CHAR_BYTE8_HEAD_P (c))
 747         {
 748           c = STRING_CHAR_ADVANCE (p);
 749           *to++ = CHAR_TO_BYTE8 (c);
 750         }
 751       else
 752         {
 753           while (len--) *to++ = *p++;
 754         }
 755     }
 756   return (to - str);
 757 }
 758
 759 /* Convert eight-bit chars in SRC (in multibyte form) to the
 760    corresponding byte and store in DST.  CHARS is the number of
 761    characters in SRC.  The value is the number of bytes stored in DST.
 762    Usually, the value is the same as CHARS, but is less than it if SRC
 763    contains a non-ASCII, non-eight-bit character.  If ACCEPT_LATIN_1
 764    is nonzero, a Latin-1 character is accepted and converted to a byte
 765    of that character code.
 766    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 767
 768 EMACS_INT
 769 str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1)
 770 {
 771   EMACS_INT i;
 772
 773   for (i = 0; i < chars; i++)
 774     {
 775       int c = STRING_CHAR_ADVANCE (src);
 776
 777       if (CHAR_BYTE8_P (c))
 778         c = CHAR_TO_BYTE8 (c);
 779       else if (! ASCII_CHAR_P (c)
 780                && (! accept_latin_1 || c >= 0x100))
 781         return i;
 782       *dst++ = c;
 783     }
 784   return i;
 785 }
 786
 787
 788 static EMACS_INT
 789 string_count_byte8 (Lisp_Object string)
 790 {
 791   int multibyte = STRING_MULTIBYTE (string);
 792   EMACS_INT nbytes = SBYTES (string);
 793   unsigned char *p = SDATA (string);
 794   unsigned char *pend = p + nbytes;
 795   EMACS_INT count = 0;
 796   int c, len;
 797
 798   if (multibyte)
 799     while (p < pend)
 800       {
 801         c = *p;
 802         len = BYTES_BY_CHAR_HEAD (c);
 803
 804         if (CHAR_BYTE8_HEAD_P (c))
 805           count++;
 806         p += len;
 807       }
 808   else
 809     while (p < pend)
 810       {
 811         if (*p++ >= 0x80)
 812           count++;
 813       }
 814   return count;
 815 }
 816
 817
 818 Lisp_Object
 819 string_escape_byte8 (Lisp_Object string)
 820 {
 821   EMACS_INT nchars = SCHARS (string);
 822   EMACS_INT nbytes = SBYTES (string);
 823   int multibyte = STRING_MULTIBYTE (string);
 824   EMACS_INT byte8_count;
 825   const unsigned char *src, *src_end;
 826   unsigned char *dst;
 827   Lisp_Object val;
 828   int c, len;
 829
 830   if (multibyte && nchars == nbytes)
 831     return string;
 832
 833   byte8_count = string_count_byte8 (string);
 834
 835   if (byte8_count == 0)
 836     return string;
 837
 838   if (multibyte)
 839     {
 840       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 841           || (MOST_POSITIVE_FIXNUM - nbytes) / 2 < byte8_count)
 842         string_overflow ();
 843
 844       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 845       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 846                                           nbytes + byte8_count * 2);
 847     }
 848   else
 849     {
 850       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count)
 851         string_overflow ();
 852
 853       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 854       val = make_uninit_string (nbytes + byte8_count * 3);
 855     }
 856
 857   src = SDATA (string);
 858   src_end = src + nbytes;
 859   dst = SDATA (val);
 860   if (multibyte)
 861     while (src < src_end)
 862       {
 863         c = *src;
 864         len = BYTES_BY_CHAR_HEAD (c);
 865
 866         if (CHAR_BYTE8_HEAD_P (c))
 867           {
 868             c = STRING_CHAR_ADVANCE (src);
 869             c = CHAR_TO_BYTE8 (c);
 870             sprintf ((char *) dst, "\\%03o", c);
 871             dst += 4;
 872           }
 873         else
 874           while (len--) *dst++ = *src++;
 875       }
 876   else
 877     while (src < src_end)
 878       {
 879         c = *src++;
 880         if (c >= 0x80)
 881           {
 882             sprintf ((char *) dst, "\\%03o", c);
 883             dst += 4;
 884           }
 885         else
 886           *dst++ = c;
 887       }
 888   return val;
 889 }
 890
 891 \f
 892 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 893        doc: /*
 894 Concatenate all the argument characters and make the result a string.
 895 usage: (string &rest CHARACTERS)  */)
 896   (size_t n, Lisp_Object *args)
 897 {
 898   size_t i;
 899   int c;
 900   unsigned char *buf, *p;
 901   Lisp_Object str;
 902   USE_SAFE_ALLOCA;
 903
 904   SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n);
 905   p = buf;
 906
 907   for (i = 0; i < n; i++)
 908     {
 909       CHECK_CHARACTER (args[i]);
 910       c = XINT (args[i]);
 911       p += CHAR_STRING (c, p);
 912     }
 913
 914   str = make_string_from_bytes ((char *) buf, n, p - buf);
 915   SAFE_FREE ();
 916   return str;
 917 }
 918
 919 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 920        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 921 usage: (unibyte-string &rest BYTES)  */)
 922   (size_t n, Lisp_Object *args)
 923 {
 924   size_t i;
 925   int c;
 926   unsigned char *buf, *p;
 927   Lisp_Object str;
 928   USE_SAFE_ALLOCA;
 929
 930   SAFE_ALLOCA (buf, unsigned char *, n);
 931   p = buf;
 932
 933   for (i = 0; i < n; i++)
 934     {
 935       CHECK_NATNUM (args[i]);
 936       c = XFASTINT (args[i]);
 937       if (c >= 256)
 938         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 939       *p++ = c;
 940     }
 941
 942   str = make_string_from_bytes ((char *) buf, n, p - buf);
 943   SAFE_FREE ();
 944   return str;
 945 }
 946
 947 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 948        Schar_resolve_modifiers, 1, 1, 0,
 949        doc: /* Resolve modifiers in the character CHAR.
 950 The value is a character with modifiers resolved into the character
 951 code.  Unresolved modifiers are kept in the value.
 952 usage: (char-resolve-modifiers CHAR)  */)
 953   (Lisp_Object character)
 954 {
 955   int c;
 956
 957   CHECK_NUMBER (character);
 958   c = XINT (character);
 959   return make_number (char_resolve_modifier_mask (c));
 960 }
 961
 962 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 963        doc: /* Return a byte value of a character at point.
 964 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 965 a byte value.
 966 Optional 2nd arg STRING, if non-nil, is a string of which first
 967 character is a target to get a byte value.  In this case, POSITION, if
 968 non-nil, is an index of a target character in the string.
 969
 970 If the current buffer (or STRING) is multibyte, and the target
 971 character is not ASCII nor 8-bit character, an error is signalled.  */)
 972   (Lisp_Object position, Lisp_Object string)
 973 {
 974   int c;
 975   EMACS_INT pos;
 976   unsigned char *p;
 977
 978   if (NILP (string))
 979     {
 980       if (NILP (position))
 981         {
 982           p = PT_ADDR;
 983         }
 984       else
 985         {
 986           CHECK_NUMBER_COERCE_MARKER (position);
 987           if (XINT (position) < BEGV || XINT (position) >= ZV)
 988             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 989           pos = XFASTINT (position);
 990           p = CHAR_POS_ADDR (pos);
 991         }
 992       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 993         return make_number (*p);
 994     }
 995   else
 996     {
 997       CHECK_STRING (string);
 998       if (NILP (position))
 999         {
1000           p = SDATA (string);
1001         }
1002       else
1003         {
1004           CHECK_NATNUM (position);
1005           if (XINT (position) >= SCHARS (string))
1006             args_out_of_range (string, position);
1007           pos = XFASTINT (position);
1008           p = SDATA (string) + string_char_to_byte (string, pos);
1009         }
1010       if (! STRING_MULTIBYTE (string))
1011         return make_number (*p);
1012     }
1013   c = STRING_CHAR (p);
1014   if (CHAR_BYTE8_P (c))
1015     c = CHAR_TO_BYTE8 (c);
1016   else if (! ASCII_CHAR_P (c))
1017     error ("Not an ASCII nor an 8-bit character: %d", c);
1018   return make_number (c);
1019 }
1020
1021
/* One-time initialization entry point for this file.  The body is
   empty: nothing is initialized here at present.  */
void
init_character_once (void)
{
}
1026
1027 #ifdef emacs
1028
/* Set up the Lisp-visible parts of this file: define the symbols it
   uses, register its primitives with defsubr, and declare and
   initialize its Lisp variables.  */
void
syms_of_character (void)
{
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table has no DEFVAR_LISP here, so it is staticpro'd
     by hand; it starts out as nil.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Register the primitives defined with DEFUN in this file.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);
  defsubr (&Sget_byte);

  DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Start with room for 16 entries, all nil.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  /* Initially only space and newline are marked.  */
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* The table is created with 1 as its initial value; the ranges
     0x80..0x9F and MAX_5_BYTE_CHAR + 1..MAX_CHAR are then set to 4.  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("printable-chars", Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* Mark the ranges 32..126 and 160..MAX_5_BYTE_CHAR with t.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* Give `char-script-table' a `char-table-extra-slots' property of 1
     before the char-table of that subtype is created.  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.
Each element is a cons (SCRIPT . CHARS).
SCRIPT is a symbol representing a script or a subgroup of a script.
CHARS is a list or a vector of characters.
If it is a list, all characters in the list are necessary for supporting SCRIPT.
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
  Vscript_representative_chars = Qnil;

  DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
               doc: /* Char table of Unicode's "General Category".
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* The correct char-table is setup in characters.el.  */
  Vunicode_category_table = Qnil;
}
1112
1113 #endif /* emacs */