--- /dev/null
+/*
+** Copyright 2011 Double Precision, Inc.
+** See COPYING for distribution information.
+**
+*/
+
+#include "unicode_config.h"
+#include "unicode.h"
+
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "linebreaktab_internal.h"
+
+#include "linebreaktab.h"
+
+#define UNICODE_LB_SOT 0xFF /* Pseudo-class kept in prevclass by unicode_lb_reset() until a first character has been processed */
+
+struct unicode_lb_info {
+ int (*cb_func)(int, void *); /* Receives each linebreak value; invoked through RESULT() */
+ void *cb_arg; /* Handed back to cb_func as its second argument */
+
+ int opts; /* UNICODE_LB_OPT_* bits stored by unicode_lb_set_opts() */
+
+ uint8_t savedclass; /* OP or HY class held back while the LB25 lookahead is pending */
+ size_t savedcmcnt; /* Count of CM classes seen while that lookahead is pending */
+
+ uint8_t prevclass; /* Class remembered from the previous character */
+ uint8_t prevclass_nsp; /* Like prevclass, but left alone when the character is SP */
+
+ int (*next_handler)(struct unicode_lb_info *, uint8_t); /* Handler for the next character's class */
+ int (*end_handler)(struct unicode_lb_info *); /* Handler run by unicode_lb_end() */
+};
+
+
+/* http://www.unicode.org/reports/tr14/#Algorithm */
+
+static int next_def(unicode_lb_info_t, uint8_t); /* Default per-character handler */
+static int end_def(unicode_lb_info_t); /* Default end-of-input handler */
+
+static int next_lb25_seenophy(unicode_lb_info_t, uint8_t); /* Per-character handler after (PR|PO)(OP|HY) */
+static int end_lb25_seenophy(unicode_lb_info_t); /* End-of-input handler after (PR|PO)(OP|HY) */
+
+static int next_lb25_seennu(unicode_lb_info_t, uint8_t); /* Per-character handler after an NU */
+
+static int next_lb25_seennuclcp(unicode_lb_info_t, uint8_t); /* Per-character handler after NU followed by CL|CP */
+
+/*
+** Put the handle into its starting state: no previous character has been
+** seen, and the default per-character and end-of-input handlers are active.
+*/
+static void unicode_lb_reset(unicode_lb_info_t i)
+{
+    i->end_handler=end_def;
+    i->next_handler=next_def;
+
+    i->prevclass_nsp=UNICODE_LB_SOT;
+    i->prevclass=UNICODE_LB_SOT;
+}
+
+/*
+** Allocate a linebreak handle and put it into its starting state.
+**
+** cb_func is called with each linebreak value and with cb_arg as its second
+** argument. Returns the new handle, or NULL when calloc() fails. The handle
+** is released by unicode_lb_end().
+*/
+unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *),
+                                  void *cb_arg)
+{
+    unicode_lb_info_t i=calloc(1, sizeof(struct unicode_lb_info));
+
+    /*
+    ** Out of memory: report it instead of dereferencing NULL below.
+    ** unicode_lbc_init() already tests this function's result for NULL.
+    */
+    if (!i)
+        return NULL;
+
+    i->cb_func=cb_func;
+    i->cb_arg=cb_arg;
+
+    unicode_lb_reset(i);
+    return i;
+}
+
+/*
+** Finish processing: run whichever end-of-input handler is currently
+** installed, release the handle, and return the handler's result.
+*/
+int unicode_lb_end(unicode_lb_info_t i)
+{
+    int (*finish)(struct unicode_lb_info *)=i->end_handler;
+    int result=finish(i);
+
+    free(i);
+
+    return result;
+}
+
+/* Replace the handle's option bits with opts. */
+void unicode_lb_set_opts(unicode_lb_info_t i, int opts)
+{
+    i->opts = opts;
+}
+
+/*
+** Default end-of-input handler. No linebreak value is pending in the
+** default state, so the callback is not invoked and 0 is returned.
+*/
+
+static int end_def(unicode_lb_info_t i)
+{
+    (void)i; /* LB3 N/A */
+
+    return 0;
+}
+#define RESULT(x) (*i->cb_func)((x), i->cb_arg) /* Pass x to the callback of the handle named i in the enclosing scope; yields the callback's return value */
+
+/*
+** Feed cnt characters, starting at chars, through unicode_lb_next() in
+** order. Stops at, and returns, the first nonzero result; returns 0 when
+** every character was accepted (including when cnt is 0).
+*/
+int unicode_lb_next_cnt(unicode_lb_info_t i,
+                        const unicode_char *chars,
+                        size_t cnt)
+{
+    size_t pos;
+
+    for (pos=0; pos < cnt; ++pos)
+    {
+        int status=unicode_lb_next(i, chars[pos]);
+
+        if (status != 0)
+            return status;
+    }
+
+    return 0;
+}
+
+/*
+** Look up the linebreak class of ch in the generated tables. Characters
+** the tables do not cover get UNICODE_LB_AL.
+*/
+int unicode_lb_lookup(unicode_char ch)
+{
+    const size_t index_entries=
+        sizeof(unicode_indextab) / sizeof(unicode_indextab[0]);
+
+    return unicode_tab_lookup(ch,
+                              unicode_indextab,
+                              index_entries,
+                              unicode_rangetab,
+                              unicode_classtab,
+                              UNICODE_LB_AL /* XX, LB1 */);
+}
+
+/*
+** Classify ch and hand the class to the currently installed per-character
+** handler, returning that handler's result.
+**
+** When UNICODE_LB_OPT_DASHWJ is set, U+2012 and U+2013 are treated as
+** class WJ instead of being looked up.
+*/
+int unicode_lb_next(unicode_lb_info_t i,
+                    unicode_char ch)
+{
+    int lbclass;
+
+    if ((i->opts & UNICODE_LB_OPT_DASHWJ) &&
+        (ch == 0x2012 || ch == 0x2013))
+        lbclass=UNICODE_LB_WJ;
+    else
+        lbclass=unicode_lb_lookup(ch);
+
+    return (*i->next_handler)(i, lbclass);
+}
+
+static int next_def_nolb25(unicode_lb_info_t i, uint8_t uclass, int nolb25);
+
+/*
+** Default per-character handler: evaluate the rules for uclass with the
+** LB25 lookahead enabled.
+*/
+static int next_def(unicode_lb_info_t i, uint8_t uclass)
+{
+    const int lb25_disabled=0;
+
+    return next_def_nolb25(i, uclass, lb25_disabled);
+}
+/*
+** Core rule evaluation for one character's linebreak class.
+**
+** Compares uclass against the classes remembered in the handle, reports the
+** resulting UNICODE_LB_NONE / UNICODE_LB_ALLOWED / UNICODE_LB_MANDATORY value
+** through RESULT(), and returns whatever the callback returned. The
+** remembered classes are updated for the next call, and an NU class
+** switches next_handler to next_lb25_seennu().
+**
+** One path makes no callback: when nolb25 is zero and (PR|PO) is followed
+** by (OP|HY), the class is parked in savedclass, the lookahead handlers are
+** installed, and 0 is returned.
+*/
+static int next_def_nolb25(unicode_lb_info_t i,
+ uint8_t uclass,
+
+ /* Flag -- recursively invoked after discarding LB25; nonzero skips the (PR|PO) lookahead test */
+ int nolb25)
+{
+
+ /* Retrieve the previous unicode character's linebreak class. */
+
+ uint8_t prevclass=i->prevclass;
+ uint8_t prevclass_nsp=i->prevclass_nsp;
+
+ /* Save this unicode char's linebreak class, for the next goaround */
+ i->prevclass=uclass;
+
+ if (uclass != UNICODE_LB_SP)
+ i->prevclass_nsp=uclass;
+
+ if (uclass == UNICODE_LB_NU)
+ i->next_handler=next_lb25_seennu; /* LB25 */
+
+ if (prevclass == UNICODE_LB_SOT) /* Nothing has been processed before this character */
+ {
+ if (uclass == UNICODE_LB_CM) /* LB9 */
+ i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
+
+ return RESULT(UNICODE_LB_NONE); /* LB2 */
+ }
+
+ if (prevclass == UNICODE_LB_CR && uclass == UNICODE_LB_LF)
+ return RESULT(UNICODE_LB_NONE); /* LB5 */
+
+ switch (prevclass) {
+ case UNICODE_LB_BK:
+ case UNICODE_LB_CR:
+ case UNICODE_LB_LF:
+ case UNICODE_LB_NL:
+
+ if (uclass == UNICODE_LB_CM)
+ {
+ i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
+ /* LB9 */
+ }
+
+ return RESULT(UNICODE_LB_MANDATORY); /* LB4, LB5 */
+
+ case UNICODE_LB_SP:
+ case UNICODE_LB_ZW:
+ if (uclass == UNICODE_LB_CM)
+ i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
+ /* LB10 */
+ break;
+ default:
+ break;
+ }
+
+ switch (uclass) {
+
+ /* LB6: */
+ case UNICODE_LB_BK:
+ case UNICODE_LB_CR:
+ case UNICODE_LB_LF:
+ case UNICODE_LB_NL:
+
+ /* LB7: */
+ case UNICODE_LB_SP:
+ case UNICODE_LB_ZW:
+
+ return RESULT(UNICODE_LB_NONE);
+ default:
+ break;
+ }
+
+ if (prevclass_nsp == UNICODE_LB_ZW)
+ return RESULT(UNICODE_LB_ALLOWED); /* LB8 */
+
+ if (uclass == UNICODE_LB_CM)
+ {
+ i->prevclass=prevclass; /* Undo the save above: the CM leaves the remembered classes as they were */
+ i->prevclass_nsp=prevclass_nsp;
+ return RESULT(UNICODE_LB_NONE); /* LB9 */
+ }
+
+ if (prevclass == UNICODE_LB_WJ || uclass == UNICODE_LB_WJ)
+ return RESULT(UNICODE_LB_NONE); /* LB11 */
+
+ if (prevclass == UNICODE_LB_GL)
+ return RESULT(UNICODE_LB_NONE); /* LB12 */
+
+ if (uclass == UNICODE_LB_GL &&
+ prevclass != UNICODE_LB_SP &&
+ prevclass != UNICODE_LB_BA &&
+ prevclass != UNICODE_LB_HY)
+ return RESULT(UNICODE_LB_NONE); /* LB12a */
+
+
+ switch (uclass) {
+ case UNICODE_LB_SY:
+ if (i->opts & UNICODE_LB_OPT_SYBREAK)
+ {
+ if (prevclass == UNICODE_LB_SP)
+ return RESULT(UNICODE_LB_ALLOWED);
+ }
+ /* Otherwise SY falls through and is handled like the LB13 classes below */
+ case UNICODE_LB_CL:
+ case UNICODE_LB_CP:
+ case UNICODE_LB_EX:
+ case UNICODE_LB_IS:
+ return RESULT(UNICODE_LB_NONE); /* LB13 */
+ default:
+ break;
+ }
+
+ if ((i->opts & UNICODE_LB_OPT_SYBREAK) && prevclass == UNICODE_LB_SY) /* Option-specific: SY directly followed by EX, AL or ID */
+ switch (uclass) {
+ case UNICODE_LB_EX:
+ case UNICODE_LB_AL:
+ case UNICODE_LB_ID:
+ return RESULT(UNICODE_LB_NONE);
+ }
+
+ if (prevclass_nsp == UNICODE_LB_OP)
+ return RESULT(UNICODE_LB_NONE); /* LB14 */
+
+ if (prevclass_nsp == UNICODE_LB_QU && uclass == UNICODE_LB_OP)
+ return RESULT(UNICODE_LB_NONE); /* LB15 */
+
+ if ((prevclass_nsp == UNICODE_LB_CL || prevclass_nsp == UNICODE_LB_CP)
+ && uclass == UNICODE_LB_NS)
+ return RESULT(UNICODE_LB_NONE); /* LB16 */
+
+ if (prevclass_nsp == UNICODE_LB_B2 && uclass == UNICODE_LB_B2)
+ return RESULT(UNICODE_LB_NONE); /* LB17 */
+
+ if (prevclass == UNICODE_LB_SP)
+ return RESULT(UNICODE_LB_ALLOWED); /* LB18 */
+
+ if (uclass == UNICODE_LB_QU || prevclass == UNICODE_LB_QU)
+ return RESULT(UNICODE_LB_NONE); /* LB19 */
+
+ if (uclass == UNICODE_LB_CB || prevclass == UNICODE_LB_CB)
+ return RESULT(UNICODE_LB_ALLOWED); /* LB20 */
+
+ /* LB21: */
+
+ switch (uclass) {
+ case UNICODE_LB_BA:
+ case UNICODE_LB_HY:
+ case UNICODE_LB_NS:
+ return RESULT(UNICODE_LB_NONE);
+ default:
+ break;
+ }
+
+ if (prevclass == UNICODE_LB_BB)
+ return RESULT(UNICODE_LB_NONE); /* LB21, continued */
+
+ if (uclass == UNICODE_LB_IN)
+ switch (prevclass) {
+ case UNICODE_LB_AL:
+ case UNICODE_LB_ID:
+ case UNICODE_LB_IN:
+ case UNICODE_LB_NU:
+ return RESULT(UNICODE_LB_NONE); /* LB22 */
+ default:
+ break;
+ }
+
+
+ if (prevclass == UNICODE_LB_ID && uclass == UNICODE_LB_PO)
+ return RESULT(UNICODE_LB_NONE); /* LB23 */
+ if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_NU)
+ return RESULT(UNICODE_LB_NONE); /* LB23 */
+
+ if (prevclass == UNICODE_LB_NU && uclass == UNICODE_LB_AL)
+ return RESULT(UNICODE_LB_NONE); /* LB23 */
+
+
+ if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_ID)
+ return RESULT(UNICODE_LB_NONE); /* LB24 */
+ if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_AL)
+ return RESULT(UNICODE_LB_NONE); /* LB24 */
+ if (prevclass == UNICODE_LB_PO && uclass == UNICODE_LB_AL)
+ return RESULT(UNICODE_LB_NONE); /* LB24 */
+
+ if ((i->opts & UNICODE_LB_OPT_PRBREAK) && uclass == UNICODE_LB_PR) /* Option-specific: PR directly after PR, AL or ID */
+ switch (prevclass) {
+ case UNICODE_LB_PR:
+ case UNICODE_LB_AL:
+ case UNICODE_LB_ID:
+ return RESULT(UNICODE_LB_NONE);
+ }
+
+ if (!nolb25 &&
+ (prevclass == UNICODE_LB_PR || prevclass == UNICODE_LB_PO))
+ {
+ if (uclass == UNICODE_LB_NU)
+ return RESULT(UNICODE_LB_NONE); /* LB25 */
+
+ if (uclass == UNICODE_LB_OP || uclass == UNICODE_LB_HY)
+ {
+ i->prevclass=prevclass; /* Keep the PR/PO as the remembered class while looking ahead */
+ i->prevclass_nsp=prevclass_nsp;
+
+ i->savedclass=uclass;
+ i->savedcmcnt=0;
+ i->next_handler=next_lb25_seenophy;
+ i->end_handler=end_lb25_seenophy;
+ return 0; /* No callback yet; the lookahead handlers report this character later */
+ }
+ }
+
+ if ((prevclass == UNICODE_LB_OP || prevclass == UNICODE_LB_HY) &&
+ uclass == UNICODE_LB_NU)
+ return RESULT(UNICODE_LB_NONE); /* LB25 */
+
+ /*****/
+
+ if (prevclass == UNICODE_LB_JL)
+ switch (uclass) {
+ case UNICODE_LB_JL:
+ case UNICODE_LB_JV:
+ case UNICODE_LB_H2:
+ case UNICODE_LB_H3:
+ return RESULT(UNICODE_LB_NONE); /* LB26 */
+ default:
+ break;
+ }
+
+ if ((prevclass == UNICODE_LB_JV ||
+ prevclass == UNICODE_LB_H2) &&
+ (uclass == UNICODE_LB_JV ||
+ uclass == UNICODE_LB_JT))
+ return RESULT(UNICODE_LB_NONE); /* LB26 */
+
+ if ((prevclass == UNICODE_LB_JT ||
+ prevclass == UNICODE_LB_H3) &&
+ uclass == UNICODE_LB_JT)
+ return RESULT(UNICODE_LB_NONE); /* LB26 */
+
+
+ switch (prevclass) {
+ case UNICODE_LB_JL:
+ case UNICODE_LB_JV:
+ case UNICODE_LB_JT:
+ case UNICODE_LB_H2:
+ case UNICODE_LB_H3:
+ if (uclass == UNICODE_LB_IN || uclass == UNICODE_LB_PO)
+ return RESULT(UNICODE_LB_NONE); /* LB27 */
+ default:
+ break;
+ }
+
+ switch (uclass) {
+ case UNICODE_LB_JL:
+ case UNICODE_LB_JV:
+ case UNICODE_LB_JT:
+ case UNICODE_LB_H2:
+ case UNICODE_LB_H3:
+ if (prevclass == UNICODE_LB_PR)
+ return RESULT(UNICODE_LB_NONE); /* LB27 */
+ default:
+ break;
+ }
+
+ if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_AL)
+ return RESULT(UNICODE_LB_NONE); /* LB28 */
+
+ if (prevclass == UNICODE_LB_IS && uclass == UNICODE_LB_AL)
+ return RESULT(UNICODE_LB_NONE); /* LB29 */
+
+ if ((prevclass == UNICODE_LB_AL || prevclass == UNICODE_LB_NU) &&
+ uclass == UNICODE_LB_OP)
+ return RESULT(UNICODE_LB_NONE); /* LB30 */
+
+ if ((uclass == UNICODE_LB_AL || uclass == UNICODE_LB_NU) &&
+ prevclass == UNICODE_LB_CP)
+ return RESULT(UNICODE_LB_NONE); /* LB30 */
+
+ return RESULT(UNICODE_LB_ALLOWED); /* LB31 */
+}
+
+/*
+** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second
+** character, but NU did not follow. Backtrack.
+**
+** Reinstalls the default handlers, then feeds savedclass back through
+** next_def_nolb25(): the first pass has the LB25 lookahead suppressed, and
+** one further pass (flag cleared) runs for each CM counted in savedcmcnt.
+** Every pass submits savedclass itself, not CM. Returns the first nonzero
+** result, or 0 when all passes succeeded.
+*/
+
+static int unwind_lb25_seenophy(unicode_lb_info_t i)
+{
+ int rc;
+
+ /*uint8_t class=i->savedclass;*/
+ int nolb25_flag=1; /* Only the first replay must bypass the lookahead test */
+
+ i->next_handler=next_def;
+ i->end_handler=end_def;
+
+ do
+ {
+ rc=next_def_nolb25(i, i->savedclass, nolb25_flag);
+
+ if (rc)
+ return rc;
+
+ /*class=UNICODE_LB_CM;*/
+ nolb25_flag=0;
+ } while (i->savedcmcnt--); /* Post-decrement: the body runs savedcmcnt+1 times, and the counter wraps around on exit */
+ return 0;
+}
+
+/*
+** Lookahead state entered after (PR|PO)(OP|HY); the linebreak value for the
+** OP/HY has not been reported yet.
+**
+** - CM: counted, nothing reported, stay in this state.
+** - NU: the modified LB25 regexp matched. UNICODE_LB_NONE is reported for
+**   the held-back OP/HY, for each counted CM, and for the NU itself, and
+**   processing continues in next_lb25_seennu().
+** - Anything else: unwind, then process this class with the default rules.
+*/
+
+static int next_lb25_seenophy(unicode_lb_info_t i,
+                              uint8_t uclass)
+{
+    int status;
+
+    if (uclass == UNICODE_LB_CM)
+    {
+        i->savedcmcnt++; /* Keep track of CMs, and try again */
+        return 0;
+    }
+
+    if (uclass == UNICODE_LB_NU)
+    {
+        /* One report for the held-back (OP|HY), plus one per counted CM */
+        do
+        {
+            status=RESULT(UNICODE_LB_NONE);
+
+            if (status != 0)
+                return status;
+        } while (i->savedcmcnt--);
+
+        i->end_handler=end_def;
+        i->next_handler=next_lb25_seennu;
+        i->prevclass_nsp=uclass;
+        i->prevclass=uclass;
+
+        return RESULT(UNICODE_LB_NONE);
+    }
+
+    /* No NU after all: replay what was held back, then handle this class */
+    status=unwind_lb25_seenophy(i);
+
+    if (status != 0)
+        return status;
+
+    return next_def_nolb25(i, uclass, 0);
+}
+
+/*
+** End of input arrived while (PR|PO)(OP|HY) was still pending. Replay the
+** held-back classes; if that succeeds, finish with the default end handler.
+*/
+
+static int end_lb25_seenophy(unicode_lb_info_t i)
+{
+    int unwound=unwind_lb25_seenophy(i);
+
+    return unwound ? unwound : end_def(i);
+}
+
+/*
+** State after an NU in the modified LB25 regexp.
+**
+** NU, SY and IS keep this state; CM is absorbed; CL or CP moves to
+** next_lb25_seennuclcp(); PR or PO ends the sequence. All of those report
+** UNICODE_LB_NONE. Any other class drops back to the default rules.
+*/
+static int next_lb25_seennu(unicode_lb_info_t i, uint8_t uclass)
+{
+    const int is_numeric_part=
+        uclass == UNICODE_LB_NU ||
+        uclass == UNICODE_LB_SY ||
+        uclass == UNICODE_LB_IS;
+    const int is_closer=
+        uclass == UNICODE_LB_CL || uclass == UNICODE_LB_CP;
+
+    if (is_numeric_part)
+    {
+        i->prevclass_nsp=uclass;
+        i->prevclass=uclass;
+        return RESULT(UNICODE_LB_NONE);
+    }
+
+    if (uclass == UNICODE_LB_CM)
+        return RESULT(UNICODE_LB_NONE); /* LB9 */
+
+    if (is_closer)
+    {
+        i->prevclass_nsp=uclass;
+        i->prevclass=uclass;
+        i->end_handler=end_def;
+        i->next_handler=next_lb25_seennuclcp;
+        return RESULT(UNICODE_LB_NONE);
+    }
+
+    /* The numeric sequence is over: back to the default handlers */
+    i->end_handler=end_def;
+    i->next_handler=next_def;
+
+    if (uclass != UNICODE_LB_PR && uclass != UNICODE_LB_PO)
+        return next_def(i, uclass); /* Not a prefix, process normally */
+
+    i->prevclass_nsp=uclass;
+    i->prevclass=uclass;
+    return RESULT(UNICODE_LB_NONE);
+}
+
+/*
+** State after NU ... (CL|CP) in the modified LB25 regexp.
+**
+** CM is absorbed and keeps this state. Every other class restores the
+** default handlers: PR or PO reports UNICODE_LB_NONE, and anything else is
+** evaluated by the default rules.
+*/
+static int next_lb25_seennuclcp(unicode_lb_info_t i, uint8_t uclass)
+{
+    if (uclass == UNICODE_LB_CM)
+        return RESULT(UNICODE_LB_NONE); /* LB9 */
+
+    i->end_handler=end_def;
+    i->next_handler=next_def;
+
+    if (uclass != UNICODE_LB_PR && uclass != UNICODE_LB_PO)
+        return next_def(i, uclass);
+
+    i->prevclass_nsp=uclass;
+    i->prevclass=uclass;
+
+    return RESULT(UNICODE_LB_NONE);
+}
+
+/******************/
+
+struct unicode_lbc_info {
+ unicode_lb_info_t handle; /* Underlying linebreak handle, created by unicode_lbc_init() */
+
+ struct unicode_buf buf; /* Characters appended by unicode_lbc_next() and not yet reported */
+
+ size_t buf_ptr; /* Index in buf of the next character unicode_lbc_callback() will report */
+
+ int (*cb_func)(int, unicode_char, void *); /* Caller's callback: linebreak value, character, cb_arg */
+ void *cb_arg; /* Passed back as cb_func's third argument */
+};
+
+/*
+** Callback registered with the underlying linebreak handle. Pairs each
+** linebreak value with the next buffered character and forwards both to
+** the caller's callback, returning its result. If no buffered character is
+** left, sets errno to EINVAL and returns -1.
+*/
+static int unicode_lbc_callback(int value, void *ptr)
+{
+    unicode_lbc_info_t lbc=(unicode_lbc_info_t)ptr;
+
+    if (lbc->buf_ptr < unicode_buf_len(&lbc->buf))
+    {
+        unicode_char ch=unicode_buf_ptr(&lbc->buf)[lbc->buf_ptr++];
+
+        return (*lbc->cb_func)(value, ch, lbc->cb_arg);
+    }
+
+    /* Shouldn't happen */
+    errno=EINVAL;
+    return -1;
+}
+
+/*
+** Create a handle whose callback receives each character together with its
+** linebreak value. Returns NULL if either the wrapper or the underlying
+** linebreak handle cannot be allocated.
+*/
+unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char, void *),
+                                    void *cb_arg)
+{
+    unicode_lbc_info_t h=(unicode_lbc_info_t)calloc(1, sizeof(*h));
+
+    if (h == NULL)
+        return NULL;
+
+    h->cb_arg=cb_arg;
+    h->cb_func=cb_func;
+
+    h->handle=unicode_lb_init(unicode_lbc_callback, h);
+
+    if (h->handle == NULL)
+    {
+        free(h);
+        return NULL;
+    }
+
+    unicode_buf_init(&h->buf, (size_t)-1);
+
+    return h;
+}
+
+/* Forward opts to the underlying linebreak handle. */
+void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts)
+{
+    unicode_lb_info_t lb=i->handle;
+
+    unicode_lb_set_opts(lb, opts);
+}
+
+/*
+** Buffer ch and run it through the underlying linebreak handle, returning
+** that handle's result. The buffer is emptied first whenever every
+** buffered character has already been reported.
+*/
+int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch)
+{
+    if (unicode_buf_len(&i->buf) <= i->buf_ptr)
+    {
+        /* Nothing is pending: start filling the buffer from the front */
+        i->buf_ptr=0;
+        unicode_buf_clear(&i->buf);
+    }
+
+    /* NOTE(review): any status from unicode_buf_append() is not checked here -- confirm against its declaration */
+    unicode_buf_append(&i->buf, &ch, 1);
+
+    return unicode_lb_next(i->handle, ch);
+}
+
+/*
+** Finish processing: end the underlying linebreak handle first, then
+** release the character buffer and the wrapper itself. Returns the result
+** of ending the underlying handle.
+*/
+int unicode_lbc_end(unicode_lbc_info_t i)
+{
+    unicode_lb_info_t lb=i->handle;
+    int status=unicode_lb_end(lb);
+
+    unicode_buf_deinit(&i->buf);
+    free(i);
+
+    return status;
+}