src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2012  Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <setjmp.h>
  38 #include <intprops.h>
  39 #include "lisp.h"
  40 #include "character.h"
  41 #include "buffer.h"
  42 #include "charset.h"
  43 #include "composite.h"
  44 #include "disptab.h"
  45
  46 #else  /* not emacs */
  47
  48 #include "mulelib.h"
  49
  50 #endif /* emacs */
  51
  52 Lisp_Object Qcharacterp;
  53
  54 static Lisp_Object Qauto_fill_chars;
  55
  56 /* Char-table of information about which character to unify to which
  57    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  58 Lisp_Object Vchar_unify_table;
  59
  60 static Lisp_Object Qchar_script_table;
  61
  62 \f
  63
  64 /* If character code C has modifier masks, reflect them to the
  65    character code if possible.  Return the resulting code.  */
  66
  67 EMACS_INT
  68 char_resolve_modifier_mask (EMACS_INT c)
  69 {
  70   /* A non-ASCII character can't reflect modifier bits to the code.  */
  71   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  72     return c;
  73
  74   /* For Meta, Shift, and Control modifiers, we need special care.  */
  75   if (c & CHAR_SHIFT)
  76     {
  77       /* Shift modifier is valid only with [A-Za-z].  */
  78       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  79         c &= ~CHAR_SHIFT;
  80       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  81         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  82       /* Shift modifier for control characters and SPC is ignored.  */
  83       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  84         c &= ~CHAR_SHIFT;
  85     }
  86   if (c & CHAR_CTL)
  87     {
  88       /* Simulate the code in lread.c.  */
  89       /* Allow `\C- ' and `\C-?'.  */
  90       if ((c & 0377) == ' ')
  91         c &= ~0177 & ~ CHAR_CTL;
  92       else if ((c & 0377) == '?')
  93         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  94       /* ASCII control chars are made from letters (both cases),
  95          as well as the non-letters within 0100...0137.  */
  96       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  97         c &= (037 | (~0177 & ~CHAR_CTL));
  98       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
  99         c &= (037 | (~0177 & ~CHAR_CTL));
 100     }
 101 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 102   if (c & CHAR_META)
 103     {
 104       /* Move the meta bit to the right place for a string.  */
 105       c = (c & ~CHAR_META) | 0x80;
 106     }
 107 #endif
 108
 109   return c;
 110 }
 111
 112
 113 /* Store multibyte form of character C at P.  If C has modifier bits,
 114    handle them appropriately.  */
 115
 116 int
 117 char_string (unsigned int c, unsigned char *p)
 118 {
 119   int bytes;
 120
 121   if (c & CHAR_MODIFIER_MASK)
 122     {
 123       c = char_resolve_modifier_mask (c);
 124       /* If C still has any modifier bits, just ignore it.  */
 125       c &= ~CHAR_MODIFIER_MASK;
 126     }
 127
 128   MAYBE_UNIFY_CHAR (c);
 129
 130   if (c <= MAX_3_BYTE_CHAR)
 131     {
 132       bytes = CHAR_STRING (c, p);
 133     }
 134   else if (c <= MAX_4_BYTE_CHAR)
 135     {
 136       p[0] = (0xF0 | (c >> 18));
 137       p[1] = (0x80 | ((c >> 12) & 0x3F));
 138       p[2] = (0x80 | ((c >> 6) & 0x3F));
 139       p[3] = (0x80 | (c & 0x3F));
 140       bytes = 4;
 141     }
 142   else if (c <= MAX_5_BYTE_CHAR)
 143     {
 144       p[0] = 0xF8;
 145       p[1] = (0x80 | ((c >> 18) & 0x0F));
 146       p[2] = (0x80 | ((c >> 12) & 0x3F));
 147       p[3] = (0x80 | ((c >> 6) & 0x3F));
 148       p[4] = (0x80 | (c & 0x3F));
 149       bytes = 5;
 150     }
 151   else if (c <= MAX_CHAR)
 152     {
 153       c = CHAR_TO_BYTE8 (c);
 154       bytes = BYTE8_STRING (c, p);
 155     }
 156   else
 157     error ("Invalid character: %x", c);
 158
 159   return bytes;
 160 }
 161
 162
 163 /* Return a character whose multibyte form is at P.  If LEN is not
 164    NULL, it must be a pointer to integer.  In that case, set *LEN to
 165    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 166    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 167    the ending address (i.e., the starting address of the next
 168    character) of the multibyte form.  */
 169
 170 int
 171 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 172 {
 173   int c;
 174   const unsigned char *saved_p = p;
 175
 176   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 177     {
 178       c = STRING_CHAR_ADVANCE (p);
 179     }
 180   else if (! (*p & 0x08))
 181     {
 182       c = ((((p)[0] & 0xF) << 18)
 183            | (((p)[1] & 0x3F) << 12)
 184            | (((p)[2] & 0x3F) << 6)
 185            | ((p)[3] & 0x3F));
 186       p += 4;
 187     }
 188   else
 189     {
 190       c = ((((p)[1] & 0x3F) << 18)
 191            | (((p)[2] & 0x3F) << 12)
 192            | (((p)[3] & 0x3F) << 6)
 193            | ((p)[4] & 0x3F));
 194       p += 5;
 195     }
 196
 197   MAYBE_UNIFY_CHAR (c);
 198
 199   if (len)
 200     *len = p - saved_p;
 201   if (advanced)
 202     *advanced = p;
 203   return c;
 204 }
 205
 206
 207 /* Translate character C by translation table TABLE.  If no translation is
 208    found in TABLE, return the untranslated character.  If TABLE is a list,
 209    elements are char tables.  In that case, recursively translate C by all the
 210    tables in the list.  */
 211
 212 int
 213 translate_char (Lisp_Object table, int c)
 214 {
 215   if (CHAR_TABLE_P (table))
 216     {
 217       Lisp_Object ch;
 218
 219       ch = CHAR_TABLE_REF (table, c);
 220       if (CHARACTERP (ch))
 221         c = XINT (ch);
 222     }
 223   else
 224     {
 225       for (; CONSP (table); table = XCDR (table))
 226         c = translate_char (XCAR (table), c);
 227     }
 228   return c;
 229 }
 230
 231 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 232    them, return (C & 0xFF).  */
 233
 234 int
 235 multibyte_char_to_unibyte (int c)
 236 {
 237   if (c < 0x80)
 238     return c;
 239   if (CHAR_BYTE8_P (c))
 240     return CHAR_TO_BYTE8 (c);
 241   return (c & 0xFF);
 242 }
 243
 244 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 245    by charset_unibyte.  */
 246
 247 int
 248 multibyte_char_to_unibyte_safe (int c)
 249 {
 250   if (c < 0x80)
 251     return c;
 252   if (CHAR_BYTE8_P (c))
 253     return CHAR_TO_BYTE8 (c);
 254   return -1;
 255 }
 256
 257 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 258        doc: /* Return non-nil if OBJECT is a character.
 259 usage: (characterp OBJECT)  */)
 260   (Lisp_Object object, Lisp_Object ignore)
 261 {
 262   return (CHARACTERP (object) ? Qt : Qnil);
 263 }
 264
 265 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 266        doc: /* Return the character of the maximum code.  */)
 267   (void)
 268 {
 269   return make_number (MAX_CHAR);
 270 }
 271
 272 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 273        Sunibyte_char_to_multibyte, 1, 1, 0,
 274        doc: /* Convert the byte CH to multibyte character.  */)
 275   (Lisp_Object ch)
 276 {
 277   int c;
 278
 279   CHECK_CHARACTER (ch);
 280   c = XFASTINT (ch);
 281   if (c >= 0x100)
 282     error ("Not a unibyte character: %d", c);
 283   MAKE_CHAR_MULTIBYTE (c);
 284   return make_number (c);
 285 }
 286
 287 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 288        Smultibyte_char_to_unibyte, 1, 1, 0,
 289        doc: /* Convert the multibyte character CH to a byte.
 290 If the multibyte character does not represent a byte, return -1.  */)
 291   (Lisp_Object ch)
 292 {
 293   int cm;
 294
 295   CHECK_CHARACTER (ch);
 296   cm = XFASTINT (ch);
 297   if (cm < 256)
 298     /* Can't distinguish a byte read from a unibyte buffer from
 299        a latin1 char, so let's let it slide.  */
 300     return ch;
 301   else
 302     {
 303       int cu = CHAR_TO_BYTE_SAFE (cm);
 304       return make_number (cu);
 305     }
 306 }
 307
 308
 309 /* Return width (columns) of C considering the buffer display table DP. */
 310
 311 static ptrdiff_t
 312 char_width (int c, struct Lisp_Char_Table *dp)
 313 {
 314   ptrdiff_t width = CHAR_WIDTH (c);
 315
 316   if (dp)
 317     {
 318       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 319       int i;
 320
 321       if (VECTORP (disp))
 322         for (i = 0, width = 0; i < ASIZE (disp); i++)
 323           {
 324             ch = AREF (disp, i);
 325             if (CHARACTERP (ch))
 326               {
 327                 int w = CHAR_WIDTH (XFASTINT (ch));
 328                 if (INT_ADD_OVERFLOW (width, w))
 329                   string_overflow ();
 330                 width += w;
 331               }
 332           }
 333     }
 334   return width;
 335 }
 336
 337
 338 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 339        doc: /* Return width of CHAR when displayed in the current buffer.
 340 The width is measured by how many columns it occupies on the screen.
 341 Tab is taken to occupy `tab-width' columns.
 342 usage: (char-width CHAR)  */)
 343   (Lisp_Object ch)
 344 {
 345   int c;
 346   ptrdiff_t width;
 347
 348   CHECK_CHARACTER (ch);
 349   c = XINT (ch);
 350   width = char_width (c, buffer_display_table ());
 351   return make_number (width);
 352 }
 353
 354 /* Return width of string STR of length LEN when displayed in the
 355    current buffer.  The width is measured by how many columns it
 356    occupies on the screen.  If PRECISION > 0, return the width of
 357    longest substring that doesn't exceed PRECISION, and set number of
 358    characters and bytes of the substring in *NCHARS and *NBYTES
 359    respectively.  */
 360
 361 ptrdiff_t
 362 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
 363                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
 364 {
 365   ptrdiff_t i = 0, i_byte = 0;
 366   ptrdiff_t width = 0;
 367   struct Lisp_Char_Table *dp = buffer_display_table ();
 368
 369   while (i_byte < len)
 370     {
 371       int bytes;
 372       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 373       ptrdiff_t thiswidth = char_width (c, dp);
 374
 375       if (precision <= 0)
 376         {
 377           if (INT_ADD_OVERFLOW (width, thiswidth))
 378             string_overflow ();
 379         }
 380       else if (precision - width < thiswidth)
 381         {
 382           *nchars = i;
 383           *nbytes = i_byte;
 384           return width;
 385         }
 386       i++;
 387       i_byte += bytes;
 388       width += thiswidth;
 389   }
 390
 391   if (precision > 0)
 392     {
 393       *nchars = i;
 394       *nbytes = i_byte;
 395     }
 396
 397   return width;
 398 }
 399
 400 /* Return width of string STR of length LEN when displayed in the
 401    current buffer.  The width is measured by how many columns it
 402    occupies on the screen.  */
 403
 404 ptrdiff_t
 405 strwidth (const char *str, ptrdiff_t len)
 406 {
 407   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 408 }
 409
 410 /* Return width of Lisp string STRING when displayed in the current
 411    buffer.  The width is measured by how many columns it occupies on
 412    the screen while paying attention to compositions.  If PRECISION >
 413    0, return the width of longest substring that doesn't exceed
 414    PRECISION, and set number of characters and bytes of the substring
 415    in *NCHARS and *NBYTES respectively.  */
 416
 417 ptrdiff_t
 418 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
 419                    ptrdiff_t *nchars, ptrdiff_t *nbytes)
 420 {
 421   ptrdiff_t len = SCHARS (string);
 422   /* This set multibyte to 0 even if STRING is multibyte when it
 423      contains only ascii and eight-bit-graphic, but that's
 424      intentional.  */
 425   int multibyte = len < SBYTES (string);
 426   unsigned char *str = SDATA (string);
 427   ptrdiff_t i = 0, i_byte = 0;
 428   ptrdiff_t width = 0;
 429   struct Lisp_Char_Table *dp = buffer_display_table ();
 430
 431   while (i < len)
 432     {
 433       ptrdiff_t chars, bytes, thiswidth;
 434       Lisp_Object val;
 435       ptrdiff_t cmp_id;
 436       ptrdiff_t ignore, end;
 437
 438       if (find_composition (i, -1, &ignore, &end, &val, string)
 439           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 440               >= 0))
 441         {
 442           thiswidth = composition_table[cmp_id]->width;
 443           chars = end - i;
 444           bytes = string_char_to_byte (string, end) - i_byte;
 445         }
 446       else
 447         {
 448           int c;
 449
 450           if (multibyte)
 451             {
 452               int cbytes;
 453               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 454               bytes = cbytes;
 455             }
 456           else
 457             c = str[i_byte], bytes = 1;
 458           chars = 1;
 459           thiswidth = char_width (c, dp);
 460         }
 461
 462       if (precision <= 0)
 463         {
 464 #ifdef emacs
 465           if (INT_ADD_OVERFLOW (width, thiswidth))
 466             string_overflow ();
 467 #endif
 468         }
 469       else if (precision - width < thiswidth)
 470         {
 471           *nchars = i;
 472           *nbytes = i_byte;
 473           return width;
 474         }
 475       i += chars;
 476       i_byte += bytes;
 477       width += thiswidth;
 478     }
 479
 480   if (precision > 0)
 481     {
 482       *nchars = i;
 483       *nbytes = i_byte;
 484     }
 485
 486   return width;
 487 }
 488
 489 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 490        doc: /* Return width of STRING when displayed in the current buffer.
 491 Width is measured by how many columns it occupies on the screen.
 492 When calculating width of a multibyte character in STRING,
 493 only the base leading-code is considered; the validity of
 494 the following bytes is not checked.  Tabs in STRING are always
 495 taken to occupy `tab-width' columns.
 496 usage: (string-width STRING)  */)
 497   (Lisp_Object str)
 498 {
 499   Lisp_Object val;
 500
 501   CHECK_STRING (str);
 502   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 503   return val;
 504 }
 505
 506 /* Return the number of characters in the NBYTES bytes at PTR.
 507    This works by looking at the contents and checking for multibyte
 508    sequences while assuming that there's no invalid sequence.
 509    However, if the current buffer has enable-multibyte-characters =
 510    nil, we treat each byte as a character.  */
 511
 512 ptrdiff_t
 513 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 514 {
 515   /* current_buffer is null at early stages of Emacs initialization.  */
 516   if (current_buffer == 0
 517       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 518     return nbytes;
 519
 520   return multibyte_chars_in_text (ptr, nbytes);
 521 }
 522
 523 /* Return the number of characters in the NBYTES bytes at PTR.
 524    This works by looking at the contents and checking for multibyte
 525    sequences while assuming that there's no invalid sequence.  It
 526    ignores enable-multibyte-characters.  */
 527
 528 ptrdiff_t
 529 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 530 {
 531   const unsigned char *endp = ptr + nbytes;
 532   ptrdiff_t chars = 0;
 533
 534   while (ptr < endp)
 535     {
 536       int len = MULTIBYTE_LENGTH (ptr, endp);
 537
 538       if (len == 0)
 539         abort ();
 540       ptr += len;
 541       chars++;
 542     }
 543
 544   return chars;
 545 }
 546
 547 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 548    characters and bytes in it, and store them in *NCHARS and *NBYTES
 549    respectively.  On counting bytes, pay attention to that 8-bit
 550    characters not constructing a valid multibyte sequence are
 551    represented by 2-byte in a multibyte text.  */
 552
 553 void
 554 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 555                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
 556 {
 557   const unsigned char *endp = str + len;
 558   int n;
 559   ptrdiff_t chars = 0, bytes = 0;
 560
 561   if (len >= MAX_MULTIBYTE_LENGTH)
 562     {
 563       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 564       while (str < adjusted_endp)
 565         {
 566           if (! CHAR_BYTE8_HEAD_P (*str)
 567               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 568             str += n, bytes += n;
 569           else
 570             str++, bytes += 2;
 571           chars++;
 572         }
 573     }
 574   while (str < endp)
 575     {
 576       if (! CHAR_BYTE8_HEAD_P (*str)
 577           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 578         str += n, bytes += n;
 579       else
 580         str++, bytes += 2;
 581       chars++;
 582     }
 583
 584   *nchars = chars;
 585   *nbytes = bytes;
 586   return;
 587 }
 588
 589 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 590    It actually converts only such 8-bit characters that don't construct
 591    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 592    NCHARS is nonzero, set *NCHARS to the number of characters in the
 593    text.  It is assured that we can use LEN bytes at STR as a work
 594    area and that is enough.  Return the number of bytes of the
 595    resulting text.  */
 596
 597 ptrdiff_t
 598 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
 599                   ptrdiff_t *nchars)
 600 {
 601   unsigned char *p = str, *endp = str + nbytes;
 602   unsigned char *to;
 603   ptrdiff_t chars = 0;
 604   int n;
 605
 606   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 607     {
 608       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 609       while (p < adjusted_endp
 610              && ! CHAR_BYTE8_HEAD_P (*p)
 611              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 612         p += n, chars++;
 613     }
 614   while (p < endp
 615          && ! CHAR_BYTE8_HEAD_P (*p)
 616          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 617     p += n, chars++;
 618   if (nchars)
 619     *nchars = chars;
 620   if (p == endp)
 621     return nbytes;
 622
 623   to = p;
 624   nbytes = endp - p;
 625   endp = str + len;
 626   memmove (endp - nbytes, p, nbytes);
 627   p = endp - nbytes;
 628
 629   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 630     {
 631       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 632       while (p < adjusted_endp)
 633         {
 634           if (! CHAR_BYTE8_HEAD_P (*p)
 635               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 636             {
 637               while (n--)
 638                 *to++ = *p++;
 639             }
 640           else
 641             {
 642               int c = *p++;
 643               c = BYTE8_TO_CHAR (c);
 644               to += CHAR_STRING (c, to);
 645             }
 646         }
 647       chars++;
 648     }
 649   while (p < endp)
 650     {
 651       if (! CHAR_BYTE8_HEAD_P (*p)
 652           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 653         {
 654           while (n--)
 655             *to++ = *p++;
 656         }
 657       else
 658         {
 659           int c = *p++;
 660           c = BYTE8_TO_CHAR (c);
 661           to += CHAR_STRING (c, to);
 662         }
 663       chars++;
 664     }
 665   if (nchars)
 666     *nchars = chars;
 667   return (to - str);
 668 }
 669
 670 /* Parse unibyte string at STR of LEN bytes, and return the number of
 671    bytes it may occupy when converted to multibyte string by
 672    `str_to_multibyte'.  */
 673
 674 ptrdiff_t
 675 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 676 {
 677   const unsigned char *endp = str + len;
 678   ptrdiff_t bytes;
 679
 680   for (bytes = 0; str < endp; str++)
 681     {
 682       int n = *str < 0x80 ? 1 : 2;
 683       if (INT_ADD_OVERFLOW (bytes, n))
 684         string_overflow ();
 685       bytes += n;
 686     }
 687   return bytes;
 688 }
 689
 690
 691 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 692    that contains the same single-byte characters.  It actually
 693    converts all 8-bit characters to multibyte forms.  It is assured
 694    that we can use LEN bytes at STR as a work area and that is
 695    enough.  */
 696
 697 ptrdiff_t
 698 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
 699 {
 700   unsigned char *p = str, *endp = str + bytes;
 701   unsigned char *to;
 702
 703   while (p < endp && *p < 0x80) p++;
 704   if (p == endp)
 705     return bytes;
 706   to = p;
 707   bytes = endp - p;
 708   endp = str + len;
 709   memmove (endp - bytes, p, bytes);
 710   p = endp - bytes;
 711   while (p < endp)
 712     {
 713       int c = *p++;
 714
 715       if (c >= 0x80)
 716         c = BYTE8_TO_CHAR (c);
 717       to += CHAR_STRING (c, to);
 718     }
 719   return (to - str);
 720 }
 721
 722 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 723    actually converts characters in the range 0x80..0xFF to
 724    unibyte.  */
 725
 726 ptrdiff_t
 727 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
 728 {
 729   const unsigned char *p = str, *endp = str + bytes;
 730   unsigned char *to;
 731   int c, len;
 732
 733   while (p < endp)
 734     {
 735       c = *p;
 736       len = BYTES_BY_CHAR_HEAD (c);
 737       if (CHAR_BYTE8_HEAD_P (c))
 738         break;
 739       p += len;
 740     }
 741   to = str + (p - str);
 742   while (p < endp)
 743     {
 744       c = *p;
 745       len = BYTES_BY_CHAR_HEAD (c);
 746       if (CHAR_BYTE8_HEAD_P (c))
 747         {
 748           c = STRING_CHAR_ADVANCE (p);
 749           *to++ = CHAR_TO_BYTE8 (c);
 750         }
 751       else
 752         {
 753           while (len--) *to++ = *p++;
 754         }
 755     }
 756   return (to - str);
 757 }
 758
 759 /* Convert eight-bit chars in SRC (in multibyte form) to the
 760    corresponding byte and store in DST.  CHARS is the number of
 761    characters in SRC.  The value is the number of bytes stored in DST.
 762    Usually, the value is the same as CHARS, but is less than it if SRC
 763    contains a non-ASCII, non-eight-bit character.  If ACCEPT_LATIN_1
 764    is nonzero, a Latin-1 character is accepted and converted to a byte
 765    of that character code.
 766    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 767
 768 ptrdiff_t
 769 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars, int accept_latin_1)
 770 {
 771   ptrdiff_t i;
 772
 773   for (i = 0; i < chars; i++)
 774     {
 775       int c = STRING_CHAR_ADVANCE (src);
 776
 777       if (CHAR_BYTE8_P (c))
 778         c = CHAR_TO_BYTE8 (c);
 779       else if (! ASCII_CHAR_P (c)
 780                && (! accept_latin_1 || c >= 0x100))
 781         return i;
 782       *dst++ = c;
 783     }
 784   return i;
 785 }
 786
 787
 788 static ptrdiff_t
 789 string_count_byte8 (Lisp_Object string)
 790 {
 791   int multibyte = STRING_MULTIBYTE (string);
 792   ptrdiff_t nbytes = SBYTES (string);
 793   unsigned char *p = SDATA (string);
 794   unsigned char *pend = p + nbytes;
 795   ptrdiff_t count = 0;
 796   int c, len;
 797
 798   if (multibyte)
 799     while (p < pend)
 800       {
 801         c = *p;
 802         len = BYTES_BY_CHAR_HEAD (c);
 803
 804         if (CHAR_BYTE8_HEAD_P (c))
 805           count++;
 806         p += len;
 807       }
 808   else
 809     while (p < pend)
 810       {
 811         if (*p++ >= 0x80)
 812           count++;
 813       }
 814   return count;
 815 }
 816
 817
 818 Lisp_Object
 819 string_escape_byte8 (Lisp_Object string)
 820 {
 821   ptrdiff_t nchars = SCHARS (string);
 822   ptrdiff_t nbytes = SBYTES (string);
 823   int multibyte = STRING_MULTIBYTE (string);
 824   ptrdiff_t byte8_count;
 825   const unsigned char *src, *src_end;
 826   unsigned char *dst;
 827   Lisp_Object val;
 828   int c, len;
 829
 830   if (multibyte && nchars == nbytes)
 831     return string;
 832
 833   byte8_count = string_count_byte8 (string);
 834
 835   if (byte8_count == 0)
 836     return string;
 837
 838   if (multibyte)
 839     {
 840       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 841           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 842         string_overflow ();
 843
 844       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 845       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 846                                           nbytes + byte8_count * 2);
 847     }
 848   else
 849     {
 850       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 851         string_overflow ();
 852
 853       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 854       val = make_uninit_string (nbytes + byte8_count * 3);
 855     }
 856
 857   src = SDATA (string);
 858   src_end = src + nbytes;
 859   dst = SDATA (val);
 860   if (multibyte)
 861     while (src < src_end)
 862       {
 863         c = *src;
 864         len = BYTES_BY_CHAR_HEAD (c);
 865
 866         if (CHAR_BYTE8_HEAD_P (c))
 867           {
 868             c = STRING_CHAR_ADVANCE (src);
 869             c = CHAR_TO_BYTE8 (c);
 870             sprintf ((char *) dst, "\\%03o", c);
 871             dst += 4;
 872           }
 873         else
 874           while (len--) *dst++ = *src++;
 875       }
 876   else
 877     while (src < src_end)
 878       {
 879         c = *src++;
 880         if (c >= 0x80)
 881           {
 882             sprintf ((char *) dst, "\\%03o", c);
 883             dst += 4;
 884           }
 885         else
 886           *dst++ = c;
 887       }
 888   return val;
 889 }
 890
 891 \f
 892 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 893        doc: /*
 894 Concatenate all the argument characters and make the result a string.
 895 usage: (string &rest CHARACTERS)  */)
 896   (ptrdiff_t n, Lisp_Object *args)
 897 {
 898   ptrdiff_t i;
 899   int c;
 900   unsigned char *buf, *p;
 901   Lisp_Object str;
 902   USE_SAFE_ALLOCA;
 903
 904   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 905   p = buf;
 906
 907   for (i = 0; i < n; i++)
 908     {
 909       CHECK_CHARACTER (args[i]);
 910       c = XINT (args[i]);
 911       p += CHAR_STRING (c, p);
 912     }
 913
 914   str = make_string_from_bytes ((char *) buf, n, p - buf);
 915   SAFE_FREE ();
 916   return str;
 917 }
 918
 919 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 920        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 921 usage: (unibyte-string &rest BYTES)  */)
 922   (ptrdiff_t n, Lisp_Object *args)
 923 {
 924   ptrdiff_t i;
 925   unsigned char *buf, *p;
 926   Lisp_Object str;
 927   USE_SAFE_ALLOCA;
 928
 929   SAFE_ALLOCA (buf, unsigned char *, n);
 930   p = buf;
 931
 932   for (i = 0; i < n; i++)
 933     {
 934       CHECK_RANGED_INTEGER (0, args[i], 255);
 935       *p++ = XINT (args[i]);
 936     }
 937
 938   str = make_string_from_bytes ((char *) buf, n, p - buf);
 939   SAFE_FREE ();
 940   return str;
 941 }
 942
 943 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 944        Schar_resolve_modifiers, 1, 1, 0,
 945        doc: /* Resolve modifiers in the character CHAR.
 946 The value is a character with modifiers resolved into the character
 947 code.  Unresolved modifiers are kept in the value.
 948 usage: (char-resolve-modifiers CHAR)  */)
 949   (Lisp_Object character)
 950 {
 951   EMACS_INT c;
 952
 953   CHECK_NUMBER (character);
 954   c = XINT (character);
 955   return make_number (char_resolve_modifier_mask (c));
 956 }
 957
 958 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 959        doc: /* Return a byte value of a character at point.
 960 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 961 a byte value.
 962 Optional 2nd arg STRING, if non-nil, is a string of which first
 963 character is a target to get a byte value.  In this case, POSITION, if
 964 non-nil, is an index of a target character in the string.
 965
 966 If the current buffer (or STRING) is multibyte, and the target
 967 character is not ASCII nor 8-bit character, an error is signaled.  */)
 968   (Lisp_Object position, Lisp_Object string)
 969 {
 970   int c;
 971   ptrdiff_t pos;
 972   unsigned char *p;
 973
 974   if (NILP (string))
 975     {
 976       if (NILP (position))
 977         {
 978           p = PT_ADDR;
 979         }
 980       else
 981         {
 982           CHECK_NUMBER_COERCE_MARKER (position);
 983           if (XINT (position) < BEGV || XINT (position) >= ZV)
 984             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 985           pos = XFASTINT (position);
 986           p = CHAR_POS_ADDR (pos);
 987         }
 988       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 989         return make_number (*p);
 990     }
 991   else
 992     {
 993       CHECK_STRING (string);
 994       if (NILP (position))
 995         {
 996           p = SDATA (string);
 997         }
 998       else
 999         {
1000           CHECK_NATNUM (position);
1001           if (XINT (position) >= SCHARS (string))
1002             args_out_of_range (string, position);
1003           pos = XFASTINT (position);
1004           p = SDATA (string) + string_char_to_byte (string, pos);
1005         }
1006       if (! STRING_MULTIBYTE (string))
1007         return make_number (*p);
1008     }
1009   c = STRING_CHAR (p);
1010   if (CHAR_BYTE8_P (c))
1011     c = CHAR_TO_BYTE8 (c);
1012   else if (! ASCII_CHAR_P (c))
1013     error ("Not an ASCII nor an 8-bit character: %d", c);
1014   return make_number (c);
1015 }
1016
1017
/* One-time initialization hook for this file.  There is currently
   nothing to initialize.  */

void
init_character_once (void)
{
  return;
}
1022
1023 #ifdef emacs
1024
/* Set up the Lisp-visible parts of this file: define its symbols,
   register its subrs with defsubr, and declare and initialize its
   Lisp variables.  */

void
syms_of_character (void)
{
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table is not declared with DEFVAR_LISP, so it is
     passed to staticpro explicitly.  It starts out as nil.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* One defsubr for each DEFUN in this file.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);
  defsubr (&Sget_byte);

  DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Start with a vector of 16 nil elements.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  /* Initially only SPC and newline are marked.  */
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* The table is created with the value 1; the ranges 0x80..0x9F and
     MAX_5_BYTE_CHAR + 1..MAX_CHAR are then set to 4.  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("printable-chars", Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* The table is created with nil; the ranges 32..126 and
     160..MAX_5_BYTE_CHAR are then set to t.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* Give `char-script-table' a `char-table-extra-slots' property of 1
     before the table itself is created.  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.
Each element is a cons (SCRIPT . CHARS).
SCRIPT is a symbol representing a script or a subgroup of a script.
CHARS is a list or a vector of characters.
If it is a list, all characters in the list are necessary for supporting SCRIPT.
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
  Vscript_representative_chars = Qnil;

  DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
               doc: /* Char table of Unicode's "General Category".
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* The correct char-table is setup in characters.el.  */
  Vunicode_category_table = Qnil;
}
1108
1109 #endif /* emacs */