+++ /dev/null
-/*
-** Copyright 2011 Double Precision, Inc.
-** See COPYING for distribution information.
-**
-*/
-
-#include "unicode_config.h"
-#include "unicode.h"
-
-#include <unistd.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <errno.h>
-
-#include "linebreaktab_internal.h"
-
-#include "linebreaktab.h"
-
-#define UNICODE_LB_SOT 0xFF
-
-/*
-** State carried between characters by the linebreaking handlers: the
-** caller's callback, the option flags, and what has been remembered
-** about the preceding characters.
-*/
-struct unicode_lb_info {
- int (*cb_func)(int, void *); /* Receives each linebreak value, and cb_arg */
- void *cb_arg; /* Passed to cb_func unchanged */
-
- int opts; /* Option flags stored by unicode_lb_set_opts() */
-
- uint8_t savedclass; /* LB25: class of the held-back OP or HY */
- size_t savedcmcnt; /* LB25: count of CMs seen after savedclass */
-
- uint8_t prevclass; /* Class recorded for the previous character */
- uint8_t prevclass_nsp; /* Same, but not updated by SP characters */
-
- /* Handler for the next character, and handler for end of input */
- int (*next_handler)(struct unicode_lb_info *, uint8_t);
- int (*end_handler)(struct unicode_lb_info *);
-};
-
-
-/* http://www.unicode.org/reports/tr14/#Algorithm */
-
-/* Default handlers, installed by unicode_lb_reset() */
-static int next_def(unicode_lb_info_t, uint8_t);
-static int end_def(unicode_lb_info_t);
-
-/* LB25 handlers: (PR|PO)(OP|HY) seen, second character's result pending */
-static int next_lb25_seenophy(unicode_lb_info_t, uint8_t);
-static int end_lb25_seenophy(unicode_lb_info_t);
-
-/* LB25 handler: NU seen */
-static int next_lb25_seennu(unicode_lb_info_t, uint8_t);
-
-/* LB25 handler: CL or CP seen after NU */
-static int next_lb25_seennuclcp(unicode_lb_info_t, uint8_t);
-
-/*
-** Put the handle into its start-of-text state: both "previous class"
-** slots hold UNICODE_LB_SOT and the default handlers are installed.
-*/
-static void unicode_lb_reset(unicode_lb_info_t i)
-{
-    i->next_handler = next_def;
-    i->end_handler = end_def;
-
-    i->prevclass_nsp = UNICODE_LB_SOT;
-    i->prevclass = UNICODE_LB_SOT;
-}
-
-/*
-** Allocate a linebreaking handle that reports its results via cb_func.
-**
-** cb_func is called with a linebreak value and cb_arg. The handle starts
-** out zeroed (no options set) and in the start-of-text state.
-**
-** Returns the new handle, or NULL if memory could not be allocated.
-** unicode_lb_end() frees the handle.
-*/
-unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *),
-                                  void *cb_arg)
-{
-    unicode_lb_info_t i = calloc(1, sizeof(struct unicode_lb_info));
-
-    /*
-    ** calloc() may fail. unicode_lbc_init() already checks this
-    ** function's return value against NULL, so report it that way
-    ** rather than dereferencing a null pointer below.
-    */
-    if (!i)
-        return NULL;
-
-    i->cb_func = cb_func;
-    i->cb_arg = cb_arg;
-
-    unicode_lb_reset(i);
-    return i;
-}
-
-/*
-** Finish a linebreaking pass: run the current end-of-input handler, then
-** free the handle. Returns the end handler's result. The handle must not
-** be used afterwards.
-*/
-int unicode_lb_end(unicode_lb_info_t i)
-{
-    const int status = i->end_handler(i);
-
-    free(i);
-
-    return status;
-}
-
-/*
-** Replace the handle's option flags with opts (the previous flags are
-** overwritten, not merged).
-*/
-void unicode_lb_set_opts(unicode_lb_info_t i, int opts)
-{
-    i->opts = opts;
-}
-
-/*
-** Default end-of-input handler: no result is pending, so there is
-** nothing to report (LB3 is not applicable). Always returns 0.
-*/
-
-static int end_def(unicode_lb_info_t i)
-{
-    (void)i; /* Unused; the signature must match end_handler */
-
-    return 0;
-}
-/*
-** Report linebreak value x to the caller's callback and yield the
-** callback's return value. Expands to a use of a handle named "i",
-** which must be in scope wherever the macro is used.
-*/
-#define RESULT(x) (*i->cb_func)((x), i->cb_arg)
-
-/*
-** Feed cnt characters, starting at chars, to unicode_lb_next() in order.
-**
-** Stops at the first character for which unicode_lb_next() returns a
-** non-zero value and returns that value; the remaining characters are
-** not processed. Returns 0 if every character was accepted (including
-** when cnt is 0).
-*/
-int unicode_lb_next_cnt(unicode_lb_info_t i,
-                        const unicode_char *chars,
-                        size_t cnt)
-{
-    size_t pos;
-
-    for (pos = 0; pos < cnt; ++pos)
-    {
-        int status = unicode_lb_next(i, chars[pos]);
-
-        if (status != 0)
-            return status;
-    }
-
-    return 0;
-}
-
-/*
-** Look up the linebreak class of ch in the generated tables.
-** Characters not covered by the tables get UNICODE_LB_AL (XX, LB1).
-*/
-int unicode_lb_lookup(unicode_char ch)
-{
-    const size_t indextab_cnt =
-        sizeof(unicode_indextab) / sizeof(unicode_indextab[0]);
-
-    return unicode_tab_lookup(ch,
-                              unicode_indextab,
-                              indextab_cnt,
-                              unicode_rangetab,
-                              unicode_classtab,
-                              UNICODE_LB_AL /* XX, LB1 */);
-}
-
-/*
-** Process one character: determine its linebreak class and pass it to
-** whichever handler is currently installed. Returns the handler's result.
-**
-** With UNICODE_LB_OPT_DASHWJ set, U+2012 and U+2013 are given class WJ
-** instead of their table class.
-*/
-int unicode_lb_next(unicode_lb_info_t i,
-                    unicode_char ch)
-{
-    uint8_t lbclass;
-
-    if ((i->opts & UNICODE_LB_OPT_DASHWJ) &&
-        (ch == 0x2012 || ch == 0x2013))
-    {
-        lbclass = UNICODE_LB_WJ;
-    }
-    else
-    {
-        lbclass = unicode_lb_lookup(ch);
-    }
-
-    return (*i->next_handler)(i, lbclass);
-}
-
-/* Default rule processing; a non-zero nolb25 disables the LB25 lookahead */
-static int next_def_nolb25(unicode_lb_info_t i,
- uint8_t uclass,
- int nolb25);
-
-/*
-** Default handler for the next character's linebreak class: ordinary
-** rule processing with the LB25 lookahead enabled (nolb25 flag = 0).
-*/
-static int next_def(unicode_lb_info_t i,
-                    uint8_t uclass)
-{
-    const int lb25_disabled = 0;
-
-    return next_def_nolb25(i, uclass, lb25_disabled);
-}
-
-/*
-** Core rule processing. Compares the class of the current character
-** (uclass) with the classes remembered for the preceding characters,
-** applies the first matching rule (the LBnn comments name the rules of
-** the algorithm referenced at the top of this file), and reports the
-** outcome through RESULT().
-**
-** Returns the callback's return value. The one exception is the LB25
-** lookahead near the end: there no callback is made, 0 is returned, and
-** the result for this character is produced later by the LB25 handlers.
-*/
-static int next_def_nolb25(unicode_lb_info_t i,
- uint8_t uclass,
-
- /* Flag -- recursively invoked after discarding LB25 */
- int nolb25)
-{
-
- /* Retrieve the previous unicode character's linebreak class. */
-
- uint8_t prevclass=i->prevclass;
- uint8_t prevclass_nsp=i->prevclass_nsp;
-
- /* Save this unicode char's linebreak class, for the next goaround */
- i->prevclass=uclass;
-
- if (uclass != UNICODE_LB_SP)
- i->prevclass_nsp=uclass;
-
- /* After a NU the following characters go to the LB25 handler */
- if (uclass == UNICODE_LB_NU)
- i->next_handler=next_lb25_seennu; /* LB25 */
-
- if (prevclass == UNICODE_LB_SOT)
- {
- /* A CM at start of text is recorded as AL */
- if (uclass == UNICODE_LB_CM) /* LB9 */
- i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
-
- return RESULT(UNICODE_LB_NONE); /* LB2 */
- }
-
- if (prevclass == UNICODE_LB_CR && uclass == UNICODE_LB_LF)
- return RESULT(UNICODE_LB_NONE); /* LB5 */
-
- switch (prevclass) {
- case UNICODE_LB_BK:
- case UNICODE_LB_CR:
- case UNICODE_LB_LF:
- case UNICODE_LB_NL:
-
- /* A CM right after a hard break is recorded as AL */
- if (uclass == UNICODE_LB_CM)
- {
- i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
- /* LB9 */
- }
-
- return RESULT(UNICODE_LB_MANDATORY); /* LB4, LB5 */
-
- case UNICODE_LB_SP:
- case UNICODE_LB_ZW:
- /* A CM after SP or ZW is treated as AL by the rules below */
- if (uclass == UNICODE_LB_CM)
- i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
- /* LB10 */
- break;
- default:
- break;
- }
-
- switch (uclass) {
-
- /* LB6: */
- case UNICODE_LB_BK:
- case UNICODE_LB_CR:
- case UNICODE_LB_LF:
- case UNICODE_LB_NL:
-
- /* LB7: */
- case UNICODE_LB_SP:
- case UNICODE_LB_ZW:
-
- return RESULT(UNICODE_LB_NONE);
- default:
- break;
- }
-
- if (prevclass_nsp == UNICODE_LB_ZW)
- return RESULT(UNICODE_LB_ALLOWED); /* LB8 */
-
- if (uclass == UNICODE_LB_CM)
- {
- /* Undo the update made above: the CM leaves the state unchanged */
- i->prevclass=prevclass;
- i->prevclass_nsp=prevclass_nsp;
- return RESULT(UNICODE_LB_NONE); /* LB9 */
- }
-
- if (prevclass == UNICODE_LB_WJ || uclass == UNICODE_LB_WJ)
- return RESULT(UNICODE_LB_NONE); /* LB11 */
-
- if (prevclass == UNICODE_LB_GL)
- return RESULT(UNICODE_LB_NONE); /* LB12 */
-
- if (uclass == UNICODE_LB_GL &&
- prevclass != UNICODE_LB_SP &&
- prevclass != UNICODE_LB_BA &&
- prevclass != UNICODE_LB_HY)
- return RESULT(UNICODE_LB_NONE); /* LB12a */
-
-
- switch (uclass) {
- case UNICODE_LB_SY:
- /* SYBREAK option: a break is allowed between SP and SY */
- if (i->opts & UNICODE_LB_OPT_SYBREAK)
- {
- if (prevclass == UNICODE_LB_SP)
- return RESULT(UNICODE_LB_ALLOWED);
- }
-
- /* Otherwise SY falls through, and is handled like CL/CP/EX/IS */
- case UNICODE_LB_CL:
- case UNICODE_LB_CP:
- case UNICODE_LB_EX:
- case UNICODE_LB_IS:
- return RESULT(UNICODE_LB_NONE); /* LB13 */
- default:
- break;
- }
-
- /* SYBREAK option: no break between SY and a following EX, AL or ID */
- if ((i->opts & UNICODE_LB_OPT_SYBREAK) && prevclass == UNICODE_LB_SY)
- switch (uclass) {
- case UNICODE_LB_EX:
- case UNICODE_LB_AL:
- case UNICODE_LB_ID:
- return RESULT(UNICODE_LB_NONE);
- }
-
- if (prevclass_nsp == UNICODE_LB_OP)
- return RESULT(UNICODE_LB_NONE); /* LB14 */
-
- if (prevclass_nsp == UNICODE_LB_QU && uclass == UNICODE_LB_OP)
- return RESULT(UNICODE_LB_NONE); /* LB15 */
-
- if ((prevclass_nsp == UNICODE_LB_CL || prevclass_nsp == UNICODE_LB_CP)
- && uclass == UNICODE_LB_NS)
- return RESULT(UNICODE_LB_NONE); /* LB16 */
-
- if (prevclass_nsp == UNICODE_LB_B2 && uclass == UNICODE_LB_B2)
- return RESULT(UNICODE_LB_NONE); /* LB17 */
-
- if (prevclass == UNICODE_LB_SP)
- return RESULT(UNICODE_LB_ALLOWED); /* LB18 */
-
- if (uclass == UNICODE_LB_QU || prevclass == UNICODE_LB_QU)
- return RESULT(UNICODE_LB_NONE); /* LB19 */
-
- if (uclass == UNICODE_LB_CB || prevclass == UNICODE_LB_CB)
- return RESULT(UNICODE_LB_ALLOWED); /* LB20 */
-
- /* LB21: */
-
- switch (uclass) {
- case UNICODE_LB_BA:
- case UNICODE_LB_HY:
- case UNICODE_LB_NS:
- return RESULT(UNICODE_LB_NONE);
- default:
- break;
- }
-
- if (prevclass == UNICODE_LB_BB)
- return RESULT(UNICODE_LB_NONE);
-
- if (uclass == UNICODE_LB_IN)
- switch (prevclass) {
- case UNICODE_LB_AL:
- case UNICODE_LB_ID:
- case UNICODE_LB_IN:
- case UNICODE_LB_NU:
- return RESULT(UNICODE_LB_NONE); /* LB22 */
- default:
- break;
- }
-
-
- if (prevclass == UNICODE_LB_ID && uclass == UNICODE_LB_PO)
- return RESULT(UNICODE_LB_NONE); /* LB23 */
- if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_NU)
- return RESULT(UNICODE_LB_NONE); /* LB23 */
-
- if (prevclass == UNICODE_LB_NU && uclass == UNICODE_LB_AL)
- return RESULT(UNICODE_LB_NONE); /* LB23 */
-
-
- if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_ID)
- return RESULT(UNICODE_LB_NONE); /* LB24 */
- if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_AL)
- return RESULT(UNICODE_LB_NONE); /* LB24 */
- if (prevclass == UNICODE_LB_PO && uclass == UNICODE_LB_AL)
- return RESULT(UNICODE_LB_NONE); /* LB24 */
-
- /* PRBREAK option: no break before PR when it follows PR, AL or ID */
- if ((i->opts & UNICODE_LB_OPT_PRBREAK) && uclass == UNICODE_LB_PR)
- switch (prevclass) {
- case UNICODE_LB_PR:
- case UNICODE_LB_AL:
- case UNICODE_LB_ID:
- return RESULT(UNICODE_LB_NONE);
- }
-
- /*
- ** LB25 lookahead, skipped when this call is replaying a character
- ** that was already held back once (nolb25 set).
- */
- if (!nolb25 &&
- (prevclass == UNICODE_LB_PR || prevclass == UNICODE_LB_PO))
- {
- if (uclass == UNICODE_LB_NU)
- return RESULT(UNICODE_LB_NONE); /* LB25 */
-
- if (uclass == UNICODE_LB_OP || uclass == UNICODE_LB_HY)
- {
- /*
- ** (PR|PO)(OP|HY): the answer depends on whether a NU
- ** follows. Undo the state update made above, remember
- ** this class, and switch to the lookahead handlers.
- */
- i->prevclass=prevclass;
- i->prevclass_nsp=prevclass_nsp;
-
- i->savedclass=uclass;
- i->savedcmcnt=0;
- i->next_handler=next_lb25_seenophy;
- i->end_handler=end_lb25_seenophy;
- /* No callback yet for this character */
- return 0;
- }
- }
-
- if ((prevclass == UNICODE_LB_OP || prevclass == UNICODE_LB_HY) &&
- uclass == UNICODE_LB_NU)
- return RESULT(UNICODE_LB_NONE); /* LB25 */
-
- /*****/
-
- if (prevclass == UNICODE_LB_JL)
- switch (uclass) {
- case UNICODE_LB_JL:
- case UNICODE_LB_JV:
- case UNICODE_LB_H2:
- case UNICODE_LB_H3:
- return RESULT(UNICODE_LB_NONE); /* LB26 */
- default:
- break;
- }
-
- if ((prevclass == UNICODE_LB_JV ||
- prevclass == UNICODE_LB_H2) &&
- (uclass == UNICODE_LB_JV ||
- uclass == UNICODE_LB_JT))
- return RESULT(UNICODE_LB_NONE); /* LB26 */
-
- if ((prevclass == UNICODE_LB_JT ||
- prevclass == UNICODE_LB_H3) &&
- uclass == UNICODE_LB_JT)
- return RESULT(UNICODE_LB_NONE); /* LB26 */
-
-
- switch (prevclass) {
- case UNICODE_LB_JL:
- case UNICODE_LB_JV:
- case UNICODE_LB_JT:
- case UNICODE_LB_H2:
- case UNICODE_LB_H3:
- if (uclass == UNICODE_LB_IN || uclass == UNICODE_LB_PO)
- return RESULT(UNICODE_LB_NONE); /* LB27 */
- /* No match: falls through to default, and on to later rules */
- default:
- break;
- }
-
- switch (uclass) {
- case UNICODE_LB_JL:
- case UNICODE_LB_JV:
- case UNICODE_LB_JT:
- case UNICODE_LB_H2:
- case UNICODE_LB_H3:
- if (prevclass == UNICODE_LB_PR)
- return RESULT(UNICODE_LB_NONE); /* LB27 */
- /* No match: falls through to default, and on to later rules */
- default:
- break;
- }
-
- if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_AL)
- return RESULT(UNICODE_LB_NONE); /* LB28 */
-
- if (prevclass == UNICODE_LB_IS && uclass == UNICODE_LB_AL)
- return RESULT(UNICODE_LB_NONE); /* LB29 */
-
- if ((prevclass == UNICODE_LB_AL || prevclass == UNICODE_LB_NU) &&
- uclass == UNICODE_LB_OP)
- return RESULT(UNICODE_LB_NONE); /* LB30 */
-
- if ((uclass == UNICODE_LB_AL || uclass == UNICODE_LB_NU) &&
- prevclass == UNICODE_LB_CP)
- return RESULT(UNICODE_LB_NONE); /* LB30 */
-
- /* No rule matched */
- return RESULT(UNICODE_LB_ALLOWED); /* LB31 */
-}
-
-/*
-** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second
-** character, but NU did not follow. Backtrack.
-**
-** Reinstalls the default handlers, then calls next_def_nolb25() once for
-** the held-back character and once more for each CM that was counted
-** after it. Returns the first non-zero result from those calls, or 0.
-*/
-
-static int unwind_lb25_seenophy(unicode_lb_info_t i)
-{
- int rc;
-
- /*uint8_t class=i->savedclass;*/
- /* First replay must not start another LB25 lookahead */
- int nolb25_flag=1;
-
- i->next_handler=next_def;
- i->end_handler=end_def;
-
- do
- {
- /* NOTE: every pass replays savedclass; CM itself is not passed */
- rc=next_def_nolb25(i, i->savedclass, nolb25_flag);
-
- if (rc)
- return rc;
-
- /*class=UNICODE_LB_CM;*/
- nolb25_flag=0;
- } while (i->savedcmcnt--); /* One extra pass per counted CM */
- return 0;
-}
-
-/*
-** LB25 lookahead state: (PR|PO)(OP|HY) has been seen, and the linebreak
-** value for the second character has not been reported yet.
-**
-** CM:   counted; nothing is reported yet.
-** NU:   the modified LB25 regexp matched. "No break" is reported for the
-**       held-back character, for each counted CM, and for this NU, and
-**       processing continues in the "seen NU" state.
-** else: unwind what was held back, then process this character with the
-**       default rules.
-*/
-
-static int next_lb25_seenophy(unicode_lb_info_t i,
-                              uint8_t uclass)
-{
-    int status;
-
-    if (uclass == UNICODE_LB_CM)
-    {
-        i->savedcmcnt += 1; /* Keep track of CMs, and try again */
-        return 0;
-    }
-
-    if (uclass == UNICODE_LB_NU)
-    {
-        /* One result for the (OP|HY), then one per counted CM */
-        for (;;)
-        {
-            status = RESULT(UNICODE_LB_NONE);
-
-            if (status)
-                return status;
-
-            if (i->savedcmcnt-- == 0)
-                break;
-        }
-
-        i->prevclass = uclass;
-        i->prevclass_nsp = uclass;
-        i->end_handler = end_def;
-        i->next_handler = next_lb25_seennu;
-
-        return RESULT(UNICODE_LB_NONE); /* The NU itself */
-    }
-
-    /* No number followed: give back what was withheld */
-    status = unwind_lb25_seenophy(i);
-
-    if (status)
-        return status;
-
-    return next_def_nolb25(i, uclass, 0);
-}
-
-/*
-** End of input arrived while (PR|PO)(OP|HY) was still pending. Unwind
-** the held-back characters; if that succeeds, finish with the default
-** end handler. Returns the first non-zero result, or 0.
-*/
-
-static int end_lb25_seenophy(unicode_lb_info_t i)
-{
-    int status = unwind_lb25_seenophy(i);
-
-    if (status != 0)
-        return status;
-
-    return end_def(i);
-}
-
-/*
-** Modified LB25 regexp, after a NU has been seen.
-**
-** CM:          no break, state unchanged (LB9).
-** NU, SY, IS:  no break, remain in this state.
-** CL, CP:      no break, move on to the "seen CL|CP" state.
-** PR, PO:      no break, return to the default handlers.
-** Other:       return to the default handlers and process normally.
-*/
-static int next_lb25_seennu(unicode_lb_info_t i, uint8_t uclass)
-{
-    const int in_number = uclass == UNICODE_LB_NU ||
-        uclass == UNICODE_LB_SY ||
-        uclass == UNICODE_LB_IS;
-    const int is_closer = uclass == UNICODE_LB_CL ||
-        uclass == UNICODE_LB_CP;
-    const int is_affix = uclass == UNICODE_LB_PR ||
-        uclass == UNICODE_LB_PO;
-
-    if (uclass == UNICODE_LB_CM)
-        return RESULT(UNICODE_LB_NONE); /* LB9 */
-
-    if (!in_number && !is_closer && !is_affix)
-    {
-        /* Not a prefix, process normally */
-        i->next_handler = next_def;
-        i->end_handler = end_def;
-        return next_def(i, uclass);
-    }
-
-    if (is_closer)
-    {
-        i->next_handler = next_lb25_seennuclcp;
-        i->end_handler = end_def;
-    }
-    else if (is_affix)
-    {
-        i->next_handler = next_def;
-        i->end_handler = end_def;
-    }
-
-    i->prevclass = uclass;
-    i->prevclass_nsp = uclass;
-
-    return RESULT(UNICODE_LB_NONE);
-}
-
-/*
-** Modified LB25 regexp, after NU ... (CL|CP) has been seen.
-**
-** CM:      no break, state unchanged (LB9).
-** PR, PO:  no break; the default handlers are reinstalled.
-** Other:   the default handlers are reinstalled and process it.
-*/
-static int next_lb25_seennuclcp(unicode_lb_info_t i, uint8_t uclass)
-{
-    if (uclass == UNICODE_LB_CM)
-        return RESULT(UNICODE_LB_NONE); /* LB9 */
-
-    /* Whatever comes next, the LB25 sequence ends here */
-    i->end_handler = end_def;
-    i->next_handler = next_def;
-
-    if (uclass != UNICODE_LB_PR && uclass != UNICODE_LB_PO)
-        return next_def(i, uclass);
-
-    i->prevclass = uclass;
-    i->prevclass_nsp = uclass;
-
-    return RESULT(UNICODE_LB_NONE);
-}
-
-/******************/
-
-/*
-** Wrapper around a linebreaking handle whose callback receives the
-** character together with its linebreak value. Characters are buffered
-** until the underlying handle reports a value for them.
-*/
-struct unicode_lbc_info {
- unicode_lb_info_t handle; /* Underlying handle, from unicode_lb_init() */
-
- struct unicode_buf buf; /* Characters given to unicode_lbc_next() */
-
- size_t buf_ptr; /* Index in buf of the next character to report */
-
- int (*cb_func)(int, unicode_char, void *); /* Caller's callback */
- void *cb_arg; /* Passed to cb_func unchanged */
-};
-
-/*
-** Callback registered with the underlying linebreaking handle. Pairs the
-** reported linebreak value with the oldest buffered character that has
-** not been reported yet, and forwards both to the caller's callback.
-**
-** Returns the caller's callback result. If no buffered character is
-** left, sets errno to EINVAL and returns -1.
-*/
-static int unicode_lbc_callback(int value, void *ptr)
-{
-    unicode_lbc_info_t ctx = (unicode_lbc_info_t)ptr;
-    unicode_char pending;
-
-    if (!(ctx->buf_ptr < unicode_buf_len(&ctx->buf)))
-    {
-        errno = EINVAL;
-        return -1; /* Shouldn't happen */
-    }
-
-    /* Consume the character before handing control to the caller */
-    pending = unicode_buf_ptr(&ctx->buf)[ctx->buf_ptr];
-    ctx->buf_ptr++;
-
-    return (*ctx->cb_func)(value, pending, ctx->cb_arg);
-}
-
-/*
-** Create a linebreaking handle whose callback receives the linebreak
-** value, the character it applies to, and cb_arg.
-**
-** Returns the new handle, or NULL if either the wrapper or the
-** underlying linebreaking handle could not be allocated.
-*/
-unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char, void *),
-                                    void *cb_arg)
-{
-    unicode_lbc_info_t h =
-        (unicode_lbc_info_t)calloc(1, sizeof(struct unicode_lbc_info));
-
-    if (h == NULL)
-        return NULL;
-
-    h->cb_arg = cb_arg;
-    h->cb_func = cb_func;
-
-    h->handle = unicode_lb_init(unicode_lbc_callback, h);
-
-    if (h->handle != NULL)
-    {
-        unicode_buf_init(&h->buf, (size_t)-1);
-        return h;
-    }
-
-    /* No underlying handle: release the wrapper again */
-    free(h);
-    return NULL;
-}
-
-/*
-** Set the option flags of the underlying linebreaking handle.
-*/
-void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts)
-{
-    unicode_lb_info_t inner = i->handle;
-
-    unicode_lb_set_opts(inner, opts);
-}
-
-/*
-** Process one character: append it to the buffer, then hand it to the
-** underlying linebreaking handle. Returns that handle's result.
-**
-** The buffer is emptied first when every character in it has already
-** been reported to the callback.
-*/
-int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch)
-{
-    if (!(i->buf_ptr < unicode_buf_len(&i->buf)))
-    {
-        /* Everything buffered so far was reported; start over */
-        i->buf_ptr = 0;
-        unicode_buf_clear(&i->buf);
-    }
-
-    unicode_buf_append(&i->buf, &ch, 1);
-
-    return unicode_lb_next(i->handle, ch);
-}
-
-/*
-** Finish processing and destroy the handle. Returns the result of
-** ending the underlying linebreaking handle.
-**
-** The underlying handle is ended before the buffer is released, because
-** its end handler may still invoke the callback, which reads the buffer.
-*/
-int unicode_lbc_end(unicode_lbc_info_t i)
-{
-    const int status = unicode_lb_end(i->handle);
-
-    unicode_buf_deinit(&i->buf);
-    free(i);
-
-    return status;
-}