-/*
-** Copyright 2011 Double Precision, Inc.
-** See COPYING for distribution information.
-**
-*/
-
-#include "unicode_config.h"
-#include "unicode.h"
-
-#include <unistd.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-
-#include "wordbreaktab_internal.h"
-#include "wordbreaktab.h"
-
-struct unicode_wb_info { /* State of one word-break scan */
- int (*cb_func)(int, void *); /* Receives each reported flag, plus cb_arg */
- void *cb_arg; /* Passed through to cb_func unchanged */
-
- uint8_t prevclass; /* Word-break class of the previous character */
- size_t wb4_cnt; /* Extend/Format characters not reported yet; wb4() reports 0 for each */
-
- size_t wb4_extra_cnt; /* Extend/Format characters counted by the seen_* lookahead handlers */
-
- int (*next_handler)(unicode_wb_info_t, uint8_t); /* Called with the class of each character */
- int (*end_handler)(unicode_wb_info_t); /* Called by unicode_wb_end(); if NULL, wb4() is called instead */
-};
-
-static int sot(unicode_wb_info_t i, uint8_t cl); /* Initial next_handler */
-static int wb4(unicode_wb_info_t i); /* Reports 0 for each pending WB4 character */
-static int wb1and2_done(unicode_wb_info_t i, uint8_t cl); /* Default next_handler after the first character */
-
-static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl); /* WB6/WB7 lookahead */
-static int seen_wb67_end_handler(unicode_wb_info_t i); /* End of text during WB6/WB7 lookahead */
-static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl); /* Rules from WB8 on */
-
-static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl); /* WB11/WB12 lookahead */
-static int seen_wb1112_end_handler(unicode_wb_info_t i); /* End of text during WB11/WB12 lookahead */
-static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl); /* Rules from WB13 on */
-
-/*
-** Allocate and set up a word-break scanning handle.
-**
-** cb_func is the function that receives each reported flag; cb_arg is
-** handed to it as its second argument. The new handle starts with sot()
-** as its character handler and with no end handler.
-**
-** Returns the new handle, or NULL if the allocation failed.
-*/
-unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *),
-				  void *cb_arg)
-{
-	unicode_wb_info_t handle=calloc(1, sizeof(struct unicode_wb_info));
-
-	if (handle)
-	{
-		handle->cb_arg=cb_arg;
-		handle->cb_func=cb_func;
-		handle->next_handler=sot;
-	}
-
-	return handle;
-}
-
-/*
-** Finish a scan and release the handle.
-**
-** Runs the current end handler if one is installed; otherwise reports
-** whatever wb4() still has pending. The handle is freed in either case
-** and must not be used afterwards.
-**
-** Returns the value produced by the end handler or by wb4().
-*/
-int unicode_wb_end(unicode_wb_info_t i)
-{
-	int status=i->end_handler ? (*i->end_handler)(i) : wb4(i);
-
-	free(i);
-	return status;
-}
-
-/*
-** Feed "cnt" characters, starting at "chars", to unicode_wb_next() in order.
-**
-** Stops at the first character for which unicode_wb_next() returns
-** non-zero and returns that value; returns 0 if every call returned 0
-** (including when cnt is 0).
-*/
-int unicode_wb_next_cnt(unicode_wb_info_t i,
-			const unicode_char *chars,
-			size_t cnt)
-{
-	size_t n;
-
-	for (n=0; n < cnt; ++n)
-	{
-		int status=unicode_wb_next(i, chars[n]);
-
-		if (status)
-			return status;
-	}
-
-	return 0;
-}
-
-/*
-** Process one character.
-**
-** Looks up the character's word-break class in the lookup tables
-** (UNICODE_WB_OTHER is passed as the fallback class) and hands that class to
-** the handle's current next_handler.
-**
-** Returns whatever the handler returns.
-*/
-int unicode_wb_next(unicode_wb_info_t i, unicode_char ch)
-{
-	uint8_t cl=unicode_tab_lookup(ch,
-				      unicode_indextab,
-				      sizeof(unicode_indextab)
-				      / sizeof(unicode_indextab[0]),
-				      unicode_rangetab,
-				      unicode_classtab,
-				      UNICODE_WB_OTHER);
-
-	return (*i->next_handler)(i, cl);
-}
-
-/*
-** Report the characters that were held back under WB4.
-**
-** Invokes the callback with flag 0 once per pending character, stopping
-** at the first non-zero return. The pending count is always zero on
-** return, even when the callback failed part-way through.
-**
-** Returns 0, or the first non-zero value returned by the callback.
-*/
-static int wb4(unicode_wb_info_t i)
-{
-	int status=0;
-
-	while (status == 0 && i->wb4_cnt > 0)
-	{
-		--i->wb4_cnt;
-		status=(*i->cb_func)(0, i->cb_arg);
-	}
-
-	/* After a failure the remaining characters are dropped unreported. */
-	i->wb4_cnt=0;
-
-	return status;
-}
-
-/*
-** Report "flag" for one character, after first reporting any characters
-** held back under WB4.
-**
-** Returns 0, or the first non-zero value returned by the callback; if
-** wb4() fails the callback is not invoked with "flag".
-*/
-static int result(unicode_wb_info_t i, int flag)
-{
-	int status=wb4(i);
-
-	if (status != 0)
-		return status;
-
-	return (*i->cb_func)(flag, i->cb_arg);
-}
-
-#define SET_HANDLER(next,end) (i->next_handler=next, i->end_handler=end) /* Expects a handle named "i" in scope */
-
-/*
-** Handler for the first character of the text.
-**
-** Switches the handle to wb1and2_done() with no end handler, records the
-** character's class, and reports flag 1 for it.
-*/
-static int sot(unicode_wb_info_t i, uint8_t cl)
-{
-	i->prevclass=cl;
-
-	i->next_handler=wb1and2_done;
-	i->end_handler=NULL;
-
-	return result(i, 1); /* WB1 */
-}
-
-/* True if "cl" is one of the CR, LF or Newline classes. */
-static int wb_is_newline_class(uint8_t cl)
-{
-	return cl == UNICODE_WB_CR
-		|| cl == UNICODE_WB_LF
-		|| cl == UNICODE_WB_Newline;
-}
-
-/*
-** Default handler for every character after the first.
-**
-** Applies WB3 through WB5 directly. An ALetter followed by MidLetter or
-** MidNumLet cannot be decided yet, so the WB6/WB7 lookahead handlers are
-** installed and nothing is reported. Everything else is passed on to
-** wb67_done().
-**
-** Returns 0, or the first non-zero value returned by the callback.
-*/
-static int wb1and2_done(unicode_wb_info_t i, uint8_t cl)
-{
-	const uint8_t before=i->prevclass;
-
-	i->prevclass=cl;
-
-	if (before == UNICODE_WB_CR && cl == UNICODE_WB_LF)
-		return result(i, 0); /* WB3 */
-
-	if (wb_is_newline_class(before) || wb_is_newline_class(cl))
-		return result(i, 1); /* WB3a, WB3b */
-
-	if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
-	{
-		/* WB4: keep the earlier class and just count this character */
-		i->prevclass=before;
-		++i->wb4_cnt;
-		return 0;
-	}
-
-	if (before == UNICODE_WB_ALetter)
-	{
-		if (cl == UNICODE_WB_ALetter)
-			return result(i, 0); /* WB5 */
-
-		if (cl == UNICODE_WB_MidLetter || cl == UNICODE_WB_MidNumLet)
-		{
-			/* WB6/WB7 depend on the character after this one */
-			i->wb4_extra_cnt=0;
-			i->next_handler=seen_wb67_handler;
-			i->end_handler=seen_wb67_end_handler;
-			return 0;
-		}
-	}
-
-	return wb67_done(i, before, cl);
-}
-
-/*
-** ALetter (MidLetter | MidNumLet ) ?
-**
-** prevclass cl
-**
-** Seen ALetter (MidLetter | MidNumLet), with the second character's status
-** not returned yet.
-**
-** Installed as next_handler by wb1and2_done(). On entry i->prevclass holds
-** the class of the second (MidLetter or MidNumLet) character, and "cl" is
-** the class of the character that follows it.
-**
-** Returns 0, or the first non-zero value propagated from the callback.
-*/
-
-static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)
-{
- int rc;
- uint8_t prevclass;
- size_t extra_cnt;
-
- if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
- {
- ++i->wb4_extra_cnt; /* Counted separately from wb4_cnt until the lookahead is resolved */
- return 0;
- }
-
- extra_cnt=i->wb4_extra_cnt;
-
- /*
- ** Reset the handler to the default, then check WB6
- */
-
- SET_HANDLER(wb1and2_done, NULL);
-
- if (cl == UNICODE_WB_ALetter)
- {
- rc=result(i, 0); /* WB6: reports 0 for the second character */
- i->wb4_cnt=extra_cnt; /* Extend/Format seen after the second character */
-
- if (rc == 0)
- rc=result(i, 0); /* WB7: reports 0 for the current character */
-
- i->prevclass=cl;
-
- return rc;
- }
-
- prevclass=i->prevclass; /* This was the second character */
-
- /*
- ** No ALetter followed, so WB6/WB7 do not apply: process the second
- ** character with the rules that come after WB7.
- */
-
- rc=wb67_done(i, UNICODE_WB_ALetter, prevclass);
-
- i->prevclass=prevclass;
- i->wb4_cnt=extra_cnt; /* Extend/Format seen after the second character */
-
- if (rc == 0)
- rc=(*i->next_handler)(i, cl);
- /* Process the current char now */
-
- return rc;
-}
-
-/*
-** Seen ALetter (MidLetter | MidNumLet), with the second character's status
-** not returned yet, and now the end of the text has been reached.
-**
-** Installed as end_handler by wb1and2_done(); unicode_wb_end() calls it.
-**
-** Returns 0, or the first non-zero value propagated from the callback.
-*/
-
-static int seen_wb67_end_handler(unicode_wb_info_t i)
-{
- int rc;
- size_t extra_cnt=i->wb4_extra_cnt;
-
- /*
- ** Process the second character with the rules that come after WB7.
- */
-
- rc=wb67_done(i, UNICODE_WB_ALetter, i->prevclass);
- i->wb4_cnt=extra_cnt; /* Extend/Format seen after the second character */
- if (rc == 0)
- rc=wb4(i); /* Report those trailing characters */
- return rc;
-}
-
-
-/*
-** Continue with the rules that follow WB7.
-**
-** prevclass and cl are the classes of the previous and the current
-** character. WB8, WB9 and WB10 are reported immediately. Numeric followed
-** by MidNum or MidNumLet cannot be decided yet, so the WB11/WB12 lookahead
-** handlers are installed and nothing is reported. Everything else goes to
-** wb1112_done().
-**
-** Returns 0, or the first non-zero value returned by the callback.
-*/
-static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
-{
-	if (prevclass == UNICODE_WB_Numeric)
-	{
-		if (cl == UNICODE_WB_Numeric || cl == UNICODE_WB_ALetter)
-			return result(i, 0); /* WB8, WB10 */
-
-		if (cl == UNICODE_WB_MidNum || cl == UNICODE_WB_MidNumLet)
-		{
-			/* WB11/WB12 depend on the character after this one */
-			i->wb4_extra_cnt=0;
-			i->next_handler=seen_wb1112_handler;
-			i->end_handler=seen_wb1112_end_handler;
-			return 0;
-		}
-	}
-	else if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_Numeric)
-	{
-		return result(i, 0); /* WB9 */
-	}
-
-	return wb1112_done(i, prevclass, cl);
-}
-
-/*
-** Numeric (MidNum | MidNumLet ) ?
-**
-** prevclass cl
-**
-** Seen Numeric (MidNum | MidNumLet), with the second character's status
-** not returned yet.
-**
-** Installed as next_handler by wb67_done(). On entry i->prevclass holds
-** the class of the second (MidNum or MidNumLet) character, and "cl" is
-** the class of the character that follows it.
-**
-** Returns 0, or the first non-zero value propagated from the callback.
-*/
-
-static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl)
-{
- int rc;
- uint8_t prevclass;
- size_t extra_cnt;
-
- if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
- {
- ++i->wb4_extra_cnt; /* Counted separately from wb4_cnt until the lookahead is resolved */
- return 0;
- }
-
- extra_cnt=i->wb4_extra_cnt;
-
- /*
- ** Reset the handler to the default, then check WB11
- */
-
- SET_HANDLER(wb1and2_done, NULL);
-
- if (cl == UNICODE_WB_Numeric)
- {
- rc=result(i, 0); /* WB11: reports 0 for the second character */
- i->wb4_cnt=extra_cnt; /* Extend/Format seen after the second character */
-
- if (rc == 0)
- rc=result(i, 0); /* WB12: reports 0 for the current character */
-
- i->prevclass=cl;
-
- return rc;
- }
-
- prevclass=i->prevclass; /* This was the second character */
-
- /*
- ** No Numeric followed, so WB11/WB12 do not apply: process the second
- ** character with the rules that come after WB12.
- */
-
- rc=wb1112_done(i, UNICODE_WB_Numeric, prevclass);
-
- i->prevclass=prevclass;
- i->wb4_cnt=extra_cnt; /* Extend/Format seen after the second character */
-
- if (rc == 0)
- rc=(*i->next_handler)(i, cl);
- /* Process the current char now */
-
- return rc;
-}
-
-/*
-** Seen Numeric (MidNum | MidNumLet), with the second character's status
-** not returned yet, and now the end of the text has been reached.
-**
-** Installed as end_handler by wb67_done(); unicode_wb_end() calls it.
-**
-** Returns 0, or the first non-zero value propagated from the callback.
-*/
-
-static int seen_wb1112_end_handler(unicode_wb_info_t i)
-{
- int rc;
- size_t extra_cnt=i->wb4_extra_cnt;
-
- /*
- ** Process the second character with the rules that come after WB12.
- */
-
- rc=wb1112_done(i, UNICODE_WB_Numeric, i->prevclass);
- i->wb4_cnt=extra_cnt; /* Extend/Format seen after the second character */
- if (rc == 0)
- rc=wb4(i); /* Report those trailing characters */
- return rc;
-}
-
-/*
-** Apply the last rules, WB13 through WB14.
-**
-** prevclass and cl are the classes of the previous and the current
-** character. Flag 0 is reported when WB13, WB13a or WB13b matches;
-** otherwise flag 1 is reported (WB14).
-**
-** Returns 0, or the first non-zero value returned by the callback.
-*/
-static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
-{
-	if (prevclass == UNICODE_WB_Katakana && cl == UNICODE_WB_Katakana)
-		return result(i, 0); /* WB13 */
-
-	if (cl == UNICODE_WB_ExtendNumLet &&
-	    (prevclass == UNICODE_WB_ALetter ||
-	     prevclass == UNICODE_WB_Numeric ||
-	     prevclass == UNICODE_WB_Katakana ||
-	     prevclass == UNICODE_WB_ExtendNumLet))
-		return result(i, 0); /* WB13a */
-
-	if (prevclass == UNICODE_WB_ExtendNumLet &&
-	    (cl == UNICODE_WB_ALetter ||
-	     cl == UNICODE_WB_Numeric ||
-	     cl == UNICODE_WB_Katakana))
-		return result(i, 0); /* WB13b */
-
-	return result(i, 1); /* WB14 */
-}
-
-/* --------------------------------------------------------------------- */
-
-struct unicode_wbscan_info { /* State of one scan built on unicode_wb_init() */
- unicode_wb_info_t wb_handle; /* Underlying word-break handle */
-
- int found; /* Set to 1 by the callback on a non-zero flag once cnt > 0 */
- size_t cnt; /* Callback invocations counted before found was set */
-};
-
-static int unicode_wbscan_callback(int, void *); /* Passed to unicode_wb_init() by unicode_wbscan_init() */
-
-/*
-** Allocate a scan handle together with its underlying word-break handle.
-**
-** The word-break handle is created with unicode_wbscan_callback() as its
-** callback and the new scan handle as the callback argument.
-**
-** Declared with (void) so that the definition is a real prototype; an
-** empty parameter list is not a prototype before C23.
-**
-** Returns the new handle, or NULL if either allocation failed (nothing
-** is leaked in that case).
-*/
-unicode_wbscan_info_t unicode_wbscan_init(void)
-{
-	unicode_wbscan_info_t i=calloc(1, sizeof(struct unicode_wbscan_info));
-
-	if (!i)
-		return NULL;
-
-	i->wb_handle=unicode_wb_init(unicode_wbscan_callback, i);
-
-	if (!i->wb_handle)
-	{
-		free(i);
-		return NULL;
-	}
-
-	return i;
-}
-
-/*
-** Feed one character to the scan.
-**
-** Once the found flag is set, further characters are ignored. The return
-** value of unicode_wb_next() is not examined.
-**
-** Returns the found flag as it stands after this character.
-*/
-int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch)
-{
-	if (i->found)
-		return i->found;
-
-	unicode_wb_next(i->wb_handle, ch);
-
-	return i->found;
-}
-
-/*
-** Finish the scan and release both handles.
-**
-** The underlying word-break handle is ended first, because ending it can
-** still invoke the callback and change the count.
-**
-** Returns the final count.
-*/
-size_t unicode_wbscan_end(unicode_wbscan_info_t i)
-{
-	size_t total;
-
-	unicode_wb_end(i->wb_handle);
-	total=i->cnt;
-
-	free(i);
-
-	return total;
-}
-
-/*
-** Callback handed to unicode_wb_init(); "arg" is the scan handle.
-**
-** Until the found flag is set, each call either sets it (non-zero flag
-** with at least one call already counted) or adds one to the count. Once
-** the flag is set, calls change nothing.
-**
-** Always returns 0.
-*/
-static int unicode_wbscan_callback(int flag, void *arg)
-{
-	unicode_wbscan_info_t scan=(unicode_wbscan_info_t)arg;
-
-	if (!scan->found)
-	{
-		if (flag && scan->cnt > 0)
-			scan->found=1;
-		else
-			++scan->cnt;
-	}
-
-	return 0;
-}
-