--- /dev/null
+/*
+** Copyright 2011 Double Precision, Inc.
+** See COPYING for distribution information.
+**
+*/
+
+#include "unicode_config.h"
+#include "unicode.h"
+
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "wordbreaktab_internal.h"
+#include "wordbreaktab.h"
+
+struct unicode_wb_info { /* State of one incremental word-break scan */
+ int (*cb_func)(int, void *); /* Called with a 0/1 flag and cb_arg */
+ void *cb_arg; /* Opaque pointer handed back to cb_func */
+
+ uint8_t prevclass; /* Class of the preceding character; Extend/Format skipped under WB4 leave it unchanged */
+ size_t wb4_cnt; /* Skipped Extend/Format characters whose cb_func(0, ...) call is still pending */
+
+ size_t wb4_extra_cnt; /* Extend/Format characters counted by the lookahead handlers; copied into wb4_cnt when the lookahead resolves */
+
+ int (*next_handler)(unicode_wb_info_t, uint8_t); /* Receives the class of each character given to unicode_wb_next() */
+ int (*end_handler)(unicode_wb_info_t); /* Run by unicode_wb_end() when non-NULL; otherwise wb4() is run */
+};
+
+static int sot(unicode_wb_info_t i, uint8_t cl); /* Handler installed by unicode_wb_init() for the first character */
+static int wb4(unicode_wb_info_t i); /* Issues the pending cb_func(0, ...) calls counted in wb4_cnt */
+static int wb1and2_done(unicode_wb_info_t i, uint8_t cl); /* Default handler once the first character was seen */
+
+static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl); /* Lookahead after ALetter (MidLetter | MidNumLet) */
+static int seen_wb67_end_handler(unicode_wb_info_t i); /* Same lookahead, input ended */
+static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl); /* Rules WB8 and later */
+
+static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl); /* Lookahead after Numeric (MidNum | MidNumLet) */
+static int seen_wb1112_end_handler(unicode_wb_info_t i); /* Same lookahead, input ended */
+static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl); /* Rules WB13 and later */
+
+/*
+** Allocate a zero-filled word-break scanner.
+**
+** cb_func is stored together with cb_arg and is later invoked with a flag
+** and cb_arg. The first character passed to the scanner is handled by
+** sot().
+**
+** Returns the new handle, or NULL when calloc() fails. The handle is
+** released by unicode_wb_end().
+*/
+unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *),
+				  void *cb_arg)
+{
+	unicode_wb_info_t handle;
+
+	handle=calloc(1, sizeof(struct unicode_wb_info));
+
+	if (handle != NULL)
+	{
+		handle->cb_func=cb_func;
+		handle->cb_arg=cb_arg;
+		handle->next_handler=sot;
+	}
+
+	return handle;
+}
+
+/*
+** Finish a scan and release the handle.
+**
+** If a lookahead state installed an end handler, it is run; otherwise only
+** the pending WB4 callbacks are flushed. The handle is freed in either
+** case and must not be used afterwards.
+**
+** Returns the value produced by the end handler or by wb4().
+*/
+int unicode_wb_end(unicode_wb_info_t i)
+{
+	int status=i->end_handler ? (*i->end_handler)(i) : wb4(i);
+
+	free(i);
+	return status;
+}
+
+/*
+** Feed cnt characters from chars to the scanner, in order.
+**
+** Stops at, and returns, the first non-zero value from unicode_wb_next().
+** Returns 0 when every character was accepted, including when cnt is 0.
+*/
+int unicode_wb_next_cnt(unicode_wb_info_t i,
+			const unicode_char *chars,
+			size_t cnt)
+{
+	size_t n;
+
+	for (n=0; n < cnt; ++n)
+	{
+		int rc=unicode_wb_next(i, chars[n]);
+
+		if (rc != 0)
+			return rc;
+	}
+
+	return 0;
+}
+
+/*
+** Look up the class of ch in the lookup tables, defaulting to
+** UNICODE_WB_OTHER, and pass that class to the scanner's current handler.
+**
+** Returns whatever the handler returns.
+*/
+int unicode_wb_next(unicode_wb_info_t i, unicode_char ch)
+{
+	uint8_t cl=unicode_tab_lookup(ch,
+				      unicode_indextab,
+				      sizeof(unicode_indextab)
+				      / sizeof(unicode_indextab[0]),
+				      unicode_rangetab,
+				      unicode_classtab,
+				      UNICODE_WB_OTHER);
+
+	return (*i->next_handler)(i, cl);
+}
+
+/*
+** Issue one cb_func(0, ...) call for every character counted in wb4_cnt.
+**
+** The counter is decremented before each call. After the first non-zero
+** callback result no further calls are made, but the counter still ends
+** up at zero.
+**
+** Returns 0, or the first non-zero callback result.
+*/
+static int wb4(unicode_wb_info_t i)
+{
+	int rc=0;
+
+	while (i->wb4_cnt > 0)
+	{
+		--i->wb4_cnt;
+
+		rc=(*i->cb_func)(0, i->cb_arg);
+
+		if (rc != 0)
+		{
+			/* Drop whatever is still pending */
+			i->wb4_cnt=0;
+			break;
+		}
+	}
+
+	return rc;
+}
+
+/*
+** Flush the pending WB4 callbacks, then report flag for the current
+** character through cb_func.
+**
+** Returns the non-zero result of the flush if there was one, otherwise
+** the result of the cb_func(flag, ...) call.
+*/
+static int result(unicode_wb_info_t i, int flag)
+{
+	int flushed=wb4(i);
+
+	return flushed ? flushed : (*i->cb_func)(flag, i->cb_arg);
+}
+
+/*
+** Install the per-character handler and the end-of-input handler on the
+** scanner named "i" in the calling scope. The macro arguments are
+** parenthesized so that any expression can be passed without precedence
+** surprises.
+*/
+#define SET_HANDLER(next,end) (i->next_handler=(next), i->end_handler=(end))
+
+/*
+** Handler for the very first character: switch to the default handler with
+** no end handler, remember the character's class, and report flag 1 for
+** it (WB1).
+*/
+static int sot(unicode_wb_info_t i, uint8_t cl)
+{
+	i->prevclass=cl;
+
+	i->next_handler=wb1and2_done;
+	i->end_handler=NULL;
+
+	return result(i, 1); /* WB1 */
+}
+
+/*
+** Default handler, used once the first character was processed.
+**
+** Applies WB3, WB3a, WB3b, WB4 and WB5 to the pair (previous class, cl).
+** ALetter followed by MidLetter or MidNumLet needs one more character to
+** decide WB6/WB7, so that pair switches to the lookahead handlers without
+** reporting anything yet. Everything else continues in wb67_done().
+**
+** Returns 0, or a non-zero value produced by the callback.
+*/
+static int wb1and2_done(unicode_wb_info_t i, uint8_t cl)
+{
+	const uint8_t before=i->prevclass;
+	const int before_is_eol=(before == UNICODE_WB_CR ||
+				 before == UNICODE_WB_LF ||
+				 before == UNICODE_WB_Newline);
+	const int cl_is_eol=(cl == UNICODE_WB_CR ||
+			     cl == UNICODE_WB_LF ||
+			     cl == UNICODE_WB_Newline);
+
+	i->prevclass=cl;
+
+	if (before == UNICODE_WB_CR && cl == UNICODE_WB_LF)
+		return result(i, 0); /* WB3 */
+
+	if (before_is_eol || cl_is_eol)
+		return result(i, 1); /* WB3a, WB3b */
+
+	if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
+	{
+		/* WB4: skip it for now, the previous class stays in effect */
+		i->prevclass=before;
+		++i->wb4_cnt;
+		return 0;
+	}
+
+	if (before != UNICODE_WB_ALetter)
+		return wb67_done(i, before, cl);
+
+	if (cl == UNICODE_WB_ALetter)
+		return result(i, 0); /* WB5 */
+
+	if (cl == UNICODE_WB_MidLetter || cl == UNICODE_WB_MidNumLet)
+	{
+		/* WB6/WB7 depend on the character after this one */
+		i->wb4_extra_cnt=0;
+		i->next_handler=seen_wb67_handler;
+		i->end_handler=seen_wb67_end_handler;
+		return 0;
+	}
+
+	return wb67_done(i, before, cl);
+}
+
+/*
+** ALetter (MidLetter | MidNumLet ) ?
+**
+** prevclass cl
+**
+** Seen ALetter (MidLetter | MidNumLet), with the second character's status
+** not returned yet.
+**
+** On entry i->prevclass holds the class of that second character, and
+** i->wb4_cnt counts Extend/Format characters seen before it that are still
+** owed a callback. Extend/Format characters that arrive while waiting are
+** counted in i->wb4_extra_cnt and are reported after the second character.
+**
+** The first character of any other class ends the lookahead. If it is an
+** ALetter, flag 0 is reported for both the second character and the
+** current one (WB6, WB7). Otherwise the second character is run through
+** wb67_done() and the current character is then given to the default
+** handler.
+**
+** Returns 0, or a non-zero value produced by the callback.
+*/
+
+static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)
+{
+ int rc;
+ uint8_t prevclass;
+ size_t extra_cnt;
+
+ if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
+ {
+ ++i->wb4_extra_cnt; /* Reported later, after the second character */
+ return 0;
+ }
+
+ extra_cnt=i->wb4_extra_cnt;
+
+ /*
+ ** Reset the handler to the default, then check WB6
+ */
+
+ SET_HANDLER(wb1and2_done, NULL);
+
+ if (cl == UNICODE_WB_ALetter)
+ {
+ rc=result(i, 0); /* WB6: flag for the second character */
+ i->wb4_cnt=extra_cnt; /* The skipped characters come next */
+
+ if (rc == 0)
+ rc=result(i, 0); /* WB7: flag for the current character */
+
+ i->prevclass=cl;
+
+ return rc;
+ }
+
+ prevclass=i->prevclass; /* This was the second character */
+
+ /*
+ ** WB6 and WB7 do not apply. Process the second character with the
+ ** rules that follow WB7.
+ */
+
+ rc=wb67_done(i, UNICODE_WB_ALetter, prevclass);
+
+ i->prevclass=prevclass;
+ i->wb4_cnt=extra_cnt; /* Flushed when the current char is reported */
+
+ if (rc == 0)
+ rc=(*i->next_handler)(i, cl);
+ /* Process the current char now */
+
+ return rc;
+}
+
+/*
+** Seen ALetter (MidLetter | MidNumLet), with the second character's status
+** not returned yet, and now the end of the text was reached.
+**
+** Installed as the end handler together with seen_wb67_handler, and run by
+** unicode_wb_end(). Returns 0, or a non-zero value produced by the
+** callback.
+*/
+
+static int seen_wb67_end_handler(unicode_wb_info_t i)
+{
+ int rc;
+ size_t extra_cnt=i->wb4_extra_cnt;
+
+ /*
+ ** No third character will arrive, so WB6 and WB7 cannot apply. Process
+ ** the second character with the rules that follow WB7.
+ */
+
+ rc=wb67_done(i, UNICODE_WB_ALetter, i->prevclass);
+ i->wb4_cnt=extra_cnt; /* Extend/Format seen after the second character */
+ if (rc == 0)
+ rc=wb4(i);
+ return rc;
+}
+
+
+/*
+** Continue with the rules after WB7 for the pair (prevclass, cl).
+**
+** WB8, WB9 and WB10 report flag 0 immediately. Numeric followed by MidNum
+** or MidNumLet needs one more character to decide WB11/WB12, so that pair
+** switches to the lookahead handlers without reporting anything yet.
+** Everything else continues in wb1112_done().
+**
+** Returns 0, or a non-zero value produced by the callback.
+*/
+static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
+{
+	if (cl == UNICODE_WB_Numeric &&
+	    (prevclass == UNICODE_WB_Numeric ||
+	     prevclass == UNICODE_WB_ALetter))
+		return result(i, 0); /* WB8, WB9 */
+
+	if (prevclass == UNICODE_WB_Numeric)
+	{
+		if (cl == UNICODE_WB_ALetter)
+			return result(i, 0); /* WB10 */
+
+		if (cl == UNICODE_WB_MidNum || cl == UNICODE_WB_MidNumLet)
+		{
+			/* WB11/WB12 depend on the character after this one */
+			i->wb4_extra_cnt=0;
+			i->next_handler=seen_wb1112_handler;
+			i->end_handler=seen_wb1112_end_handler;
+			return 0;
+		}
+	}
+
+	return wb1112_done(i, prevclass, cl);
+}
+
+/*
+** Numeric (MidNum | MidNumLet ) ?
+**
+** prevclass cl
+**
+** Seen Numeric (MidNum | MidNumLet), with the second character's status
+** not returned yet.
+**
+** On entry i->prevclass holds the class of that second character.
+** Extend/Format characters that arrive while waiting are counted in
+** i->wb4_extra_cnt and are reported after the second character.
+**
+** The first character of any other class ends the lookahead. If it is
+** Numeric, flag 0 is reported for both the second character and the
+** current one (WB11, WB12). Otherwise the second character is run through
+** wb1112_done() and the current character is then given to the default
+** handler.
+**
+** Returns 0, or a non-zero value produced by the callback.
+*/
+
+static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl)
+{
+ int rc;
+ uint8_t prevclass;
+ size_t extra_cnt;
+
+ if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
+ {
+ ++i->wb4_extra_cnt; /* Reported later, after the second character */
+ return 0;
+ }
+
+ extra_cnt=i->wb4_extra_cnt;
+
+ /*
+ ** Reset the handler to the default, then check WB11
+ */
+
+ SET_HANDLER(wb1and2_done, NULL);
+
+ if (cl == UNICODE_WB_Numeric)
+ {
+ rc=result(i, 0); /* WB11: flag for the second character */
+ i->wb4_cnt=extra_cnt; /* The skipped characters come next */
+
+ if (rc == 0)
+ rc=result(i, 0); /* WB12: flag for the current character */
+
+ i->prevclass=cl;
+
+ return rc;
+ }
+
+ prevclass=i->prevclass; /* This was the second character */
+
+ /*
+ ** WB11 and WB12 do not apply. Process the second character with the
+ ** rules that follow WB12.
+ */
+
+ rc=wb1112_done(i, UNICODE_WB_Numeric, prevclass);
+
+ i->prevclass=prevclass;
+ i->wb4_cnt=extra_cnt; /* Flushed when the current char is reported */
+
+ if (rc == 0)
+ rc=(*i->next_handler)(i, cl);
+ /* Process the current char now */
+
+ return rc;
+}
+
+/*
+** Seen Numeric (MidNum | MidNumLet), with the second character's status
+** not returned yet, and now the end of the text was reached.
+**
+** Installed as the end handler together with seen_wb1112_handler, and run
+** by unicode_wb_end(). Returns 0, or a non-zero value produced by the
+** callback.
+*/
+
+static int seen_wb1112_end_handler(unicode_wb_info_t i)
+{
+ int rc;
+ size_t extra_cnt=i->wb4_extra_cnt;
+
+ /*
+ ** No third character will arrive, so WB11 and WB12 cannot apply.
+ ** Process the second character with the rules that follow WB12.
+ */
+
+ rc=wb1112_done(i, UNICODE_WB_Numeric, i->prevclass);
+ i->wb4_cnt=extra_cnt; /* Extend/Format seen after the second character */
+ if (rc == 0)
+ rc=wb4(i);
+ return rc;
+}
+
+/*
+** Apply the remaining rules, WB13 through WB14, to the pair
+** (prevclass, cl) and report the outcome for the current character.
+**
+** Flag 0 is reported when WB13, WB13a or WB13b matches the pair, flag 1
+** otherwise (WB14).
+**
+** Returns 0, or a non-zero value produced by the callback.
+*/
+static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
+{
+	int joined=0;
+
+	if (prevclass == UNICODE_WB_Katakana && cl == UNICODE_WB_Katakana)
+	{
+		joined=1; /* WB13 */
+	}
+	else if (cl == UNICODE_WB_ExtendNumLet)
+	{
+		joined=(prevclass == UNICODE_WB_ALetter ||
+			prevclass == UNICODE_WB_Numeric ||
+			prevclass == UNICODE_WB_Katakana ||
+			prevclass == UNICODE_WB_ExtendNumLet); /* WB13a */
+	}
+	else if (prevclass == UNICODE_WB_ExtendNumLet)
+	{
+		joined=(cl == UNICODE_WB_ALetter ||
+			cl == UNICODE_WB_Numeric ||
+			cl == UNICODE_WB_Katakana); /* WB13b */
+	}
+
+	return result(i, joined ? 0 : 1); /* 1 is WB14 */
+}
+
+/* --------------------------------------------------------------------- */
+
+struct unicode_wbscan_info { /* State of a scan that stops at the first flagged position after the start */
+ unicode_wb_info_t wb_handle; /* Underlying scanner created by unicode_wb_init() */
+
+ int found; /* Set to 1 by the callback when a non-zero flag arrives and cnt is already above 0 */
+ size_t cnt; /* Callback invocations counted while found was still 0 */
+};
+
+static int unicode_wbscan_callback(int, void *); /* Passed as cb_func to unicode_wb_init() */
+
+/*
+** Allocate a zero-filled scan state together with its underlying
+** word-break scanner, whose callback is unicode_wbscan_callback() with the
+** new state as its argument.
+**
+** Takes no arguments; the parameter list is spelled (void) so that the
+** definition provides a real prototype.
+**
+** Returns the new handle, or NULL when either allocation fails (nothing
+** is leaked in that case). The handle is released by
+** unicode_wbscan_end().
+*/
+unicode_wbscan_info_t unicode_wbscan_init(void)
+{
+	unicode_wbscan_info_t i=calloc(1, sizeof(struct unicode_wbscan_info));
+
+	if (!i)
+		return NULL;
+
+	i->wb_handle=unicode_wb_init(unicode_wbscan_callback, i);
+
+	if (i->wb_handle == NULL)
+	{
+		/* Do not leak the outer state when the scanner is missing */
+		free(i);
+		return NULL;
+	}
+
+	return i;
+}
+
+/*
+** Feed one character to the scan unless the result was already found.
+**
+** Once i->found is set, further characters are not passed to the
+** underlying scanner. The return value of unicode_wb_next() is not used.
+**
+** Returns the current value of i->found.
+*/
+int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch)
+{
+	if (i->found)
+		return i->found;
+
+	(void)unicode_wb_next(i->wb_handle, ch);
+
+	return i->found;
+}
+
+/*
+** Finish the scan and release both the underlying scanner and the scan
+** state.
+**
+** The underlying scanner is ended first, because that can still invoke
+** the callback and so change the count. The handle must not be used
+** afterwards.
+**
+** Returns the final value of the counter.
+*/
+size_t unicode_wbscan_end(unicode_wbscan_info_t i)
+{
+	size_t counted;
+
+	(void)unicode_wb_end(i->wb_handle);
+
+	counted=i->cnt;
+	free(i);
+
+	return counted;
+}
+
+/*
+** Callback for the underlying scanner; arg is the scan state.
+**
+** Until the result is found, every call either marks the scan as found
+** (non-zero flag with at least one character already counted) or counts
+** one more character. Calls after that change nothing.
+**
+** Always returns 0.
+*/
+static int unicode_wbscan_callback(int flag, void *arg)
+{
+	unicode_wbscan_info_t scan=(unicode_wbscan_info_t)arg;
+
+	if (scan->found)
+		return 0;
+
+	if (flag && scan->cnt > 0)
+	{
+		scan->found=1;
+		return 0;
+	}
+
+	++scan->cnt;
+	return 0;
+}
+