Add C interface for Unicode character property table.
[bpt/emacs.git] / src / character.h
index 4c468e1..d8e77c5 100644 (file)
@@ -23,6 +23,8 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 #ifndef EMACS_CHARACTER_H
 #define EMACS_CHARACTER_H
 
+#include <verify.h>
+
 /* character code      1st byte   byte sequence
    --------------      --------   -------------
         0-7F           00..7F     0xxxxxxx
@@ -69,7 +71,7 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 #define CHAR_TO_BYTE8(c)       \
   (CHAR_BYTE8_P (c)            \
    ? (c) - 0x3FFF00            \
-   : multibyte_char_to_unibyte (c, Qnil))
+   : multibyte_char_to_unibyte (c))
 
 /* Return the raw 8-bit byte for character C,
    or -1 if C doesn't correspond to a byte.  */
@@ -102,13 +104,13 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 #define make_char(c) make_number (c)
 
 /* Nonzero iff C is an ASCII byte.  */
-#define ASCII_BYTE_P(c) ((unsigned) (c) < 0x80)
+#define ASCII_BYTE_P(c) UNSIGNED_CMP (c, <, 0x80)
 
 /* Nonzero iff X is a character.  */
 #define CHARACTERP(x) (NATNUMP (x) && XFASTINT (x) <= MAX_CHAR)
 
-/* Nonzero iff C is valid as a character code.  GENERICP is not used.  */
-#define CHAR_VALID_P(c, genericp) ((unsigned) (c) <= MAX_CHAR)
+/* Nonzero iff C is valid as a character code.  */
+#define CHAR_VALID_P(c) UNSIGNED_CMP (c, <=, MAX_CHAR)
 
 /* Check if Lisp object X is a character or not.  */
 #define CHECK_CHARACTER(x) \
@@ -129,7 +131,7 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
   } while (0)
 
 /* Nonzero iff C is a character of code less than 0x100.  */
-#define SINGLE_BYTE_CHAR_P(c) ((unsigned) (c) < 0x100)
+#define SINGLE_BYTE_CHAR_P(c) UNSIGNED_CMP (c, <, 0x100)
 
 /* Nonzero if character C has a printable glyph.  */
 #define CHAR_PRINTABLE_P(c)    \
@@ -161,19 +163,19 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
    Returns the length of the multibyte form.  */
 
 #define CHAR_STRING(c, p)                      \
-  ((unsigned) (c) <= MAX_1_BYTE_CHAR           \
+  (UNSIGNED_CMP (c, <=, MAX_1_BYTE_CHAR)       \
    ? ((p)[0] = (c),                            \
       1)                                       \
-   : (unsigned) (c) <= MAX_2_BYTE_CHAR         \
+   : UNSIGNED_CMP (c, <=, MAX_2_BYTE_CHAR)     \
    ? ((p)[0] = (0xC0 | ((c) >> 6)),            \
       (p)[1] = (0x80 | ((c) & 0x3F)),          \
       2)                                       \
-   : (unsigned) (c) <= MAX_3_BYTE_CHAR         \
+   : UNSIGNED_CMP (c, <=, MAX_3_BYTE_CHAR)     \
    ? ((p)[0] = (0xE0 | ((c) >> 12)),           \
       (p)[1] = (0x80 | (((c) >> 6) & 0x3F)),   \
       (p)[2] = (0x80 | ((c) & 0x3F)),          \
       3)                                       \
-   : char_string ((unsigned) c, p))
+   : verify_expr (sizeof (c) <= sizeof (unsigned), char_string (c, p)))
 
 /* Store multibyte form of byte B in P.  The caller should allocate at
    least MAX_MULTIBYTE_LENGTH bytes area at P in advance.  Returns the
@@ -201,7 +203,10 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
        *(p)++ = (0x80 | ((c) & 0x3F));         \
     else                                       \
-      (p) += char_string ((c), (p));           \
+      {                                                \
+       verify (sizeof (c) <= sizeof (unsigned));       \
+       (p) += char_string (c, p);              \
+      }                                                \
   } while (0)
 
 
@@ -278,11 +283,11 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
   do {                                                                 \
     if ((p) > (limit))                                                 \
       {                                                                        \
-       const unsigned char *pcb = (p);                                 \
+       const unsigned char *chp = (p);                                 \
        do {                                                            \
-         pcb--;                                                        \
-       } while (pcb >= limit && ! CHAR_HEAD_P (*pcb));                 \
-       (p) = (BYTES_BY_CHAR_HEAD (*pcb) == (p) - pcb) ? pcb : (p) - 1; \
+         chp--;                                                        \
+       } while (chp >= limit && ! CHAR_HEAD_P (*chp));                 \
+       (p) = (BYTES_BY_CHAR_HEAD (*chp) == (p) - chp) ? chp : (p) - 1; \
       }                                                                        \
   } while (0)
 
@@ -353,11 +358,11 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
       CHARIDX++;                                                       \
       if (STRING_MULTIBYTE (STRING))                                   \
        {                                                               \
-         unsigned char *string_ptr = &SDATA (STRING)[BYTEIDX];         \
-         int string_len;                                               \
+         unsigned char *chp = &SDATA (STRING)[BYTEIDX];                \
+         int chlen;                                                    \
                                                                        \
-         OUTPUT = STRING_CHAR_AND_LENGTH (string_ptr, string_len);     \
-         BYTEIDX += string_len;                                        \
+         OUTPUT = STRING_CHAR_AND_LENGTH (chp, chlen);                 \
+         BYTEIDX += chlen;                                             \
        }                                                               \
       else                                                             \
        {                                                               \
@@ -376,11 +381,11 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
       CHARIDX++;                                                             \
       if (STRING_MULTIBYTE (STRING))                                         \
        {                                                                     \
-         unsigned char *ptr = &SDATA (STRING)[BYTEIDX];                      \
-         int ptrlen;                                                         \
+         unsigned char *chp = &SDATA (STRING)[BYTEIDX];                      \
+         int chlen;                                                          \
                                                                              \
-         OUTPUT = STRING_CHAR_AND_LENGTH (ptr, ptrlen);                      \
-         BYTEIDX += ptrlen;                                                  \
+         OUTPUT = STRING_CHAR_AND_LENGTH (chp, chlen);                       \
+         BYTEIDX += chlen;                                                   \
        }                                                                     \
       else                                                                   \
        {                                                                     \
@@ -416,11 +421,11 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
       CHARIDX++;                                               \
       if (!NILP (BVAR (current_buffer, enable_multibyte_characters)))  \
        {                                                       \
-         unsigned char *ptr = BYTE_POS_ADDR (BYTEIDX);         \
-         int string_len;                                       \
+         unsigned char *chp = BYTE_POS_ADDR (BYTEIDX);         \
+         int chlen;                                            \
                                                                \
-         OUTPUT= STRING_CHAR_AND_LENGTH (ptr, string_len);     \
-         BYTEIDX += string_len;                                \
+         OUTPUT= STRING_CHAR_AND_LENGTH (chp, chlen);          \
+         BYTEIDX += chlen;                                     \
        }                                                       \
       else                                                     \
        {                                                       \
@@ -436,11 +441,11 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 #define FETCH_CHAR_ADVANCE_NO_CHECK(OUTPUT, CHARIDX, BYTEIDX)  \
   do                                                           \
     {                                                          \
-      unsigned char *ptr = BYTE_POS_ADDR (BYTEIDX);            \
-      int len;                                                 \
+      unsigned char *chp = BYTE_POS_ADDR (BYTEIDX);            \
+      int chlen;                                                       \
                                                                \
-      OUTPUT = STRING_CHAR_AND_LENGTH (ptr, len);              \
-      BYTEIDX += len;                                          \
+      OUTPUT = STRING_CHAR_AND_LENGTH (chp, chlen);            \
+      BYTEIDX += chlen;                                                \
       CHARIDX++;                                               \
     }                                                          \
   while (0)
@@ -451,8 +456,8 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 
 #define INC_POS(pos_byte)                              \
   do {                                                 \
-    unsigned char *ptr = BYTE_POS_ADDR (pos_byte);     \
-    pos_byte += BYTES_BY_CHAR_HEAD (*ptr);             \
+    unsigned char *chp = BYTE_POS_ADDR (pos_byte);     \
+    pos_byte += BYTES_BY_CHAR_HEAD (*chp);             \
   } while (0)
 
 
@@ -461,16 +466,16 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 
 #define DEC_POS(pos_byte)                      \
   do {                                         \
-    unsigned char *ptr;                                \
+    unsigned char *chp;                                \
                                                \
     pos_byte--;                                        \
     if (pos_byte < GPT_BYTE)                   \
-      ptr = BEG_ADDR + pos_byte - BEG_BYTE;    \
+      chp = BEG_ADDR + pos_byte - BEG_BYTE;    \
     else                                       \
-      ptr = BEG_ADDR + GAP_SIZE + pos_byte - BEG_BYTE; \
-    while (!CHAR_HEAD_P (*ptr))                        \
+      chp = BEG_ADDR + GAP_SIZE + pos_byte - BEG_BYTE; \
+    while (!CHAR_HEAD_P (*chp))                        \
       {                                                \
-       ptr--;                                  \
+       chp--;                                  \
        pos_byte--;                             \
       }                                                \
   } while (0)
@@ -510,8 +515,8 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 
 #define BUF_INC_POS(buf, pos_byte)                             \
   do {                                                         \
-    unsigned char *bbp = BUF_BYTE_ADDRESS (buf, pos_byte);     \
-    pos_byte += BYTES_BY_CHAR_HEAD (*bbp);                     \
+    unsigned char *chp = BUF_BYTE_ADDRESS (buf, pos_byte);     \
+    pos_byte += BYTES_BY_CHAR_HEAD (*chp);                     \
   } while (0)
 
 
@@ -520,15 +525,15 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 
 #define BUF_DEC_POS(buf, pos_byte)                                     \
   do {                                                                 \
-    unsigned char *p;                                                  \
+    unsigned char *chp;                                                        \
     pos_byte--;                                                                \
     if (pos_byte < BUF_GPT_BYTE (buf))                                 \
-      p = BUF_BEG_ADDR (buf) + pos_byte - BEG_BYTE;                    \
+      chp = BUF_BEG_ADDR (buf) + pos_byte - BEG_BYTE;                  \
     else                                                               \
-      p = BUF_BEG_ADDR (buf) + BUF_GAP_SIZE (buf) + pos_byte - BEG_BYTE;\
-    while (!CHAR_HEAD_P (*p))                                          \
+      chp = BUF_BEG_ADDR (buf) + BUF_GAP_SIZE (buf) + pos_byte - BEG_BYTE;\
+    while (!CHAR_HEAD_P (*chp))                                                \
       {                                                                        \
-       p--;                                                            \
+       chp--;                                                          \
        pos_byte--;                                                     \
       }                                                                        \
   } while (0)
@@ -544,7 +549,7 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
        Lisp_Object val;                                \
        val = CHAR_TABLE_REF (Vchar_unify_table, c);    \
        if (INTEGERP (val))                             \
-         c = XINT (val);                               \
+         c = XFASTINT (val);                           \
        else if (! NILP (val))                          \
          c = maybe_unify_char (c, val);                \
       }                                                        \
@@ -592,6 +597,45 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
    : (c) <= 0xDFFF ? 2                 \
    : 0)
 
+/* Data type for Unicode general category.
+
+   The order of members must be in sync with the 8th element of the
+   member of unidata-prop-alist (in admin/unidata/unidata-getn.el) for
+   Unicode character property `general-category'.  */
+
+typedef enum {
+  UNICODE_CATEGORY_UNKNOWN = 0,
+  UNICODE_CATEGORY_Lu,
+  UNICODE_CATEGORY_Ll,
+  UNICODE_CATEGORY_Lt,
+  UNICODE_CATEGORY_Lm,
+  UNICODE_CATEGORY_Lo,
+  UNICODE_CATEGORY_Mn,
+  UNICODE_CATEGORY_Mc,
+  UNICODE_CATEGORY_Me,
+  UNICODE_CATEGORY_Nd,
+  UNICODE_CATEGORY_Nl,
+  UNICODE_CATEGORY_No,
+  UNICODE_CATEGORY_Pc,
+  UNICODE_CATEGORY_Pd,
+  UNICODE_CATEGORY_Ps,
+  UNICODE_CATEGORY_Pe,
+  UNICODE_CATEGORY_Pi,
+  UNICODE_CATEGORY_Pf,
+  UNICODE_CATEGORY_Po,
+  UNICODE_CATEGORY_Sm,
+  UNICODE_CATEGORY_Sc,
+  UNICODE_CATEGORY_Sk,
+  UNICODE_CATEGORY_So,
+  UNICODE_CATEGORY_Zs,
+  UNICODE_CATEGORY_Zl,
+  UNICODE_CATEGORY_Zp,
+  UNICODE_CATEGORY_Cc,
+  UNICODE_CATEGORY_Cf,
+  UNICODE_CATEGORY_Cs,
+  UNICODE_CATEGORY_Co,
+  UNICODE_CATEGORY_Cn
+} unicode_category_t;
 
 extern int char_resolve_modifier_mask (int);
 extern int char_string (unsigned, unsigned char *);
@@ -602,7 +646,7 @@ extern int translate_char (Lisp_Object, int c);
 extern int char_printable_p (int c);
 extern void parse_str_as_multibyte (const unsigned char *,
                                    EMACS_INT, EMACS_INT *, EMACS_INT *);
-extern EMACS_INT parse_str_to_multibyte (const unsigned char *, EMACS_INT);
+extern EMACS_INT count_size_as_multibyte (const unsigned char *, EMACS_INT);
 extern EMACS_INT str_as_multibyte (unsigned char *, EMACS_INT, EMACS_INT,
                             EMACS_INT *);
 extern EMACS_INT str_to_multibyte (unsigned char *, EMACS_INT, EMACS_INT);
@@ -612,10 +656,10 @@ extern EMACS_INT str_to_unibyte (const unsigned char *, unsigned char *,
 extern EMACS_INT strwidth (const char *, EMACS_INT);
 extern EMACS_INT c_string_width (const unsigned char *, EMACS_INT, int,
                                 EMACS_INT *, EMACS_INT *);
-extern EMACS_INT lisp_string_width (Lisp_Object, int,
+extern EMACS_INT lisp_string_width (Lisp_Object, EMACS_INT,
                                    EMACS_INT *, EMACS_INT *);
 
-extern Lisp_Object Qcharacterp, Qauto_fill_chars;
+extern Lisp_Object Qcharacterp;
 extern Lisp_Object Vchar_unify_table;
 extern Lisp_Object string_escape_byte8 (Lisp_Object);