Imported Debian patch 0.66.1-1
[hcoop/debian/courier-authlib.git] / libs / unicode / unicode_wordbreak.c
diff --git a/libs/unicode/unicode_wordbreak.c b/libs/unicode/unicode_wordbreak.c
new file mode 100644 (file)
index 0000000..dee4b52
--- /dev/null
@@ -0,0 +1,448 @@
+/*
+** Copyright 2011 Double Precision, Inc.
+** See COPYING for distribution information.
+**
+*/
+
+#include       "unicode_config.h"
+#include       "unicode.h"
+
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "wordbreaktab_internal.h"
+#include "wordbreaktab.h"
+
+struct unicode_wb_info {
+       int (*cb_func)(int, void *);
+       void *cb_arg;
+
+       uint8_t prevclass;
+       size_t wb4_cnt;
+
+       size_t wb4_extra_cnt;
+
+       int (*next_handler)(unicode_wb_info_t, uint8_t);
+       int (*end_handler)(unicode_wb_info_t);
+};
+
+static int sot(unicode_wb_info_t i, uint8_t cl);
+static int wb4(unicode_wb_info_t i);
+static int wb1and2_done(unicode_wb_info_t i, uint8_t cl);
+
+static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl);
+static int seen_wb67_end_handler(unicode_wb_info_t i);
+static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);
+
+static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl);
+static int seen_wb1112_end_handler(unicode_wb_info_t i);
+static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);
+
+unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *),
+                                 void *cb_arg)
+{
+       unicode_wb_info_t i=calloc(1, sizeof(struct unicode_wb_info));
+
+       if (!i)
+               return NULL;
+
+       i->next_handler=sot;
+       i->cb_func=cb_func;
+       i->cb_arg=cb_arg;
+       return i;
+}
+
+int unicode_wb_end(unicode_wb_info_t i)
+{
+       int rc;
+
+       if (i->end_handler)
+               rc=(*i->end_handler)(i);
+       else
+               rc=wb4(i);
+
+       free(i);
+       return rc;
+}
+
+int unicode_wb_next_cnt(unicode_wb_info_t i,
+                       const unicode_char *chars,
+                       size_t cnt)
+{
+       int rc;
+
+       while (cnt)
+       {
+               rc=unicode_wb_next(i, *chars++);
+               --cnt;
+               if (rc)
+                       return rc;
+       }
+       return 0;
+}
+
+int unicode_wb_next(unicode_wb_info_t i, unicode_char ch)
+{
+       return (*i->next_handler)
+               (i, unicode_tab_lookup(ch,
+                                      unicode_indextab,
+                                      sizeof(unicode_indextab)
+                                      / sizeof(unicode_indextab[0]),
+                                      unicode_rangetab,
+                                      unicode_classtab,
+                                      UNICODE_WB_OTHER));
+}
+
+static int wb4(unicode_wb_info_t i)
+{
+       int rc=0;
+
+       while (i->wb4_cnt > 0)
+       {
+               --i->wb4_cnt;
+
+               if (rc == 0)
+                       rc=(*i->cb_func)(0, i->cb_arg);
+       }
+       return rc;
+}
+
+static int result(unicode_wb_info_t i, int flag)
+{
+       int rc=wb4(i);
+
+       if (rc == 0)
+               rc=(*i->cb_func)(flag, i->cb_arg);
+
+       return rc;
+}
+
+#define SET_HANDLER(next,end) (i->next_handler=next, i->end_handler=end)
+
+static int sot(unicode_wb_info_t i, uint8_t cl)
+{
+       i->prevclass=cl;
+       SET_HANDLER(wb1and2_done, NULL);
+
+       return result(i, 1);    /* WB1 */
+}
+
+static int wb1and2_done(unicode_wb_info_t i, uint8_t cl)
+{
+       uint8_t prevclass=i->prevclass;
+
+       i->prevclass=cl;
+
+       if (prevclass == UNICODE_WB_CR && cl == UNICODE_WB_LF)
+               return result(i, 0); /* WB3 */
+
+       switch (prevclass) {
+       case UNICODE_WB_CR:
+       case UNICODE_WB_LF:
+       case UNICODE_WB_Newline:
+               return result(i, 1); /* WB3a */
+       }
+
+       switch (cl) {
+       case UNICODE_WB_CR:
+       case UNICODE_WB_LF:
+       case UNICODE_WB_Newline:
+               return result(i, 1); /* WB3b */
+       }
+
+       if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
+       {
+               i->prevclass=prevclass;
+               ++i->wb4_cnt;
+               return 0; /* WB4 */
+       }
+
+       if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_ALetter)
+       {
+               return result(i, 0); /* WB5 */
+       }
+
+       if (prevclass == UNICODE_WB_ALetter &&
+           (cl == UNICODE_WB_MidLetter || cl == UNICODE_WB_MidNumLet))
+       {
+               i->wb4_extra_cnt=0;
+               SET_HANDLER(seen_wb67_handler, seen_wb67_end_handler);
+               return 0;
+       }
+
+       return wb67_done(i, prevclass, cl);
+}
+
+/*
+**              ALetter     (MidLetter | MidNumLet )     ?
+**
+**                                  prevclass            cl
+**
+** Seen ALetter (MidLetter | MidNumLet), with the second character's status
+** not returned yet.
+*/
+
+static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)
+{
+       int rc;
+       uint8_t prevclass;
+       size_t extra_cnt;
+
+       if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
+       {
+               ++i->wb4_extra_cnt;
+               return 0;
+       }
+
+       extra_cnt=i->wb4_extra_cnt;
+
+       /*
+       ** Reset the handler to the default, then check WB6
+       */
+
+       SET_HANDLER(wb1and2_done, NULL);
+
+       if (cl == UNICODE_WB_ALetter)
+       {
+               rc=result(i, 0); /* WB6 */
+               i->wb4_cnt=extra_cnt;
+
+               if (rc == 0)
+                       rc=result(i, 0); /* WB7 */
+
+               i->prevclass=cl;
+                       
+               return rc;
+       }
+
+       prevclass=i->prevclass; /* This was the second character */
+
+       /*
+       ** Process the second character, starting with WB7
+       */
+
+       rc=wb67_done(i, UNICODE_WB_ALetter, prevclass);
+
+       i->prevclass=prevclass;
+       i->wb4_cnt=extra_cnt;
+
+       if (rc == 0)
+               rc=(*i->next_handler)(i, cl);
+       /* Process the current char now */
+
+       return rc;
+}
+
+/*
+** Seen ALetter (MidLetter | MidNumLet), with the second character's status
+** not returned yet, and now sot.
+*/
+
+static int seen_wb67_end_handler(unicode_wb_info_t i)
+{
+       int rc;
+       size_t extra_cnt=i->wb4_extra_cnt;
+
+       /*
+       ** Process the second character, starting with WB7.
+       */
+
+       rc=wb67_done(i, UNICODE_WB_ALetter, i->prevclass);
+       i->wb4_cnt=extra_cnt;
+       if (rc == 0)
+               rc=wb4(i);
+       return rc;
+}
+
+
+static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
+{
+       if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_Numeric)
+               return result(i, 0); /* WB8 */
+
+       if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_Numeric)
+               return result(i, 0); /* WB9 */
+
+       if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_ALetter)
+               return result(i, 0); /* WB10 */
+
+
+       if (prevclass == UNICODE_WB_Numeric &&
+           (cl == UNICODE_WB_MidNum || cl == UNICODE_WB_MidNumLet))
+       {
+               i->wb4_extra_cnt=0;
+               SET_HANDLER(seen_wb1112_handler, seen_wb1112_end_handler);
+               return 0;
+       }
+
+       return wb1112_done(i, prevclass, cl);
+}
+
+/*
+**              Numeric     (MidNum | MidNumLet )     ?
+**
+**                               prevclass            cl
+**
+** Seen Numeric (MidNum | MidNumLet), with the second character's status
+** not returned yet.
+*/
+
+static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl)
+{
+       int rc;
+       uint8_t prevclass;
+       size_t extra_cnt;
+
+       if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
+       {
+               ++i->wb4_extra_cnt;
+               return 0;
+       }
+
+       extra_cnt=i->wb4_extra_cnt;
+
+       /*
+       ** Reset the handler to the default, then check WB6
+       */
+
+       SET_HANDLER(wb1and2_done, NULL);
+
+       if (cl == UNICODE_WB_Numeric)
+       {
+               rc=result(i, 0); /* WB11 */
+               i->wb4_cnt=extra_cnt;
+
+               if (rc == 0)
+                       rc=result(i, 0); /* WB12 */
+
+               i->prevclass=cl;
+                       
+               return rc;
+       }
+
+       prevclass=i->prevclass; /* This was the second character */
+
+       /*
+       ** Process the second character, starting with WB7
+       */
+
+       rc=wb1112_done(i, UNICODE_WB_Numeric, prevclass);
+
+       i->prevclass=prevclass;
+       i->wb4_cnt=extra_cnt;
+
+       if (rc == 0)
+               rc=(*i->next_handler)(i, cl);
+       /* Process the current char now */
+
+       return rc;
+}
+
+/*
+** Seen Numeric (MidNum | MidNumLet), with the second character's status
+** not returned yet, and now sot.
+*/
+
+static int seen_wb1112_end_handler(unicode_wb_info_t i)
+{
+       int rc;
+       size_t extra_cnt=i->wb4_extra_cnt;
+
+       /*
+       ** Process the second character, starting with WB11.
+       */
+
+       rc=wb1112_done(i, UNICODE_WB_Numeric, i->prevclass);
+       i->wb4_cnt=extra_cnt;
+       if (rc == 0)
+               rc=wb4(i);
+       return rc;
+}
+
+static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
+{
+       if (prevclass == UNICODE_WB_Katakana &&
+           cl == UNICODE_WB_Katakana)
+               return result(i, 0); /* WB13 */
+
+       switch (prevclass) {
+       case UNICODE_WB_ALetter:
+       case UNICODE_WB_Numeric:
+       case UNICODE_WB_Katakana:
+       case UNICODE_WB_ExtendNumLet:
+               if (cl == UNICODE_WB_ExtendNumLet)
+                       return result(i, 0); /* WB13a */
+       }
+
+       if (prevclass == UNICODE_WB_ExtendNumLet)
+               switch (cl) {
+               case UNICODE_WB_ALetter:
+               case UNICODE_WB_Numeric:
+               case UNICODE_WB_Katakana:
+                       return result(i, 0); /* WB13b */
+               }
+
+       return result(i, 1); /* WB14 */
+}
+
+/* --------------------------------------------------------------------- */
+
+struct unicode_wbscan_info {
+       unicode_wb_info_t wb_handle;
+
+       int found;
+       size_t cnt;
+};
+
+static int unicode_wbscan_callback(int, void *);
+
+unicode_wbscan_info_t unicode_wbscan_init()
+{
+       unicode_wbscan_info_t i=calloc(1, sizeof(struct unicode_wbscan_info));
+
+       if (!i)
+               return NULL;
+
+       if ((i->wb_handle=unicode_wb_init(unicode_wbscan_callback, i)) == NULL)
+       {
+               free(i);
+               return NULL;
+       }
+
+       return i;
+}
+
+int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch)
+{
+       if (!i->found)
+               unicode_wb_next(i->wb_handle, ch);
+
+       return i->found;
+}
+
+size_t unicode_wbscan_end(unicode_wbscan_info_t i)
+{
+       size_t n;
+
+       unicode_wb_end(i->wb_handle);
+
+       n=i->cnt;
+       free(i);
+       return n;
+}
+
+static int unicode_wbscan_callback(int flag, void *arg)
+{
+       unicode_wbscan_info_t i=(unicode_wbscan_info_t)arg;
+
+       if (flag && i->cnt > 0)
+               i->found=1;
+
+       if (!i->found)
+               ++i->cnt;
+       return 0;
+}
+