Merge branch 'debian'
[hcoop/debian/courier-authlib.git] / libs / unicode / unicode_linebreak.c
diff --git a/libs/unicode/unicode_linebreak.c b/libs/unicode/unicode_linebreak.c
new file mode 100644 (file)
index 0000000..1105dec
--- /dev/null
@@ -0,0 +1,632 @@
+/*
+** Copyright 2011 Double Precision, Inc.
+** See COPYING for distribution information.
+**
+*/
+
+#include       "unicode_config.h"
+#include       "unicode.h"
+
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include "linebreaktab_internal.h"
+
+#include "linebreaktab.h"
+
+#define UNICODE_LB_SOT 0xFF /* Pseudo-class stored in prevclass/prevclass_nsp until the first character arrives (start of text) */
+
+struct unicode_lb_info { /* State of one line-breaking pass; allocated by unicode_lb_init() */
+       int (*cb_func)(int, void *); /* Called with each UNICODE_LB_* verdict and cb_arg */
+       void *cb_arg; /* Handed back to cb_func unchanged */
+
+       int opts; /* UNICODE_LB_OPT_* bits, stored by unicode_lb_set_opts() */
+
+       uint8_t savedclass; /* LB25 lookahead: the OP or HY class whose verdict is withheld */
+       size_t savedcmcnt; /* LB25 lookahead: CMs counted since savedclass was stored */
+
+       uint8_t prevclass; /* Class recorded for the previous character, or UNICODE_LB_SOT */
+       uint8_t prevclass_nsp; /* Same, except that the default handler does not update it for SP */
+
+       int (*next_handler)(struct unicode_lb_info *, uint8_t); /* Handler for the next character's class (current state) */
+       int (*end_handler)(struct unicode_lb_info *); /* Handler invoked by unicode_lb_end() (current state) */
+};
+
+
+/* http://www.unicode.org/reports/tr14/#Algorithm */
+
+static int next_def(unicode_lb_info_t, uint8_t);
+static int end_def(unicode_lb_info_t);
+
+static int next_lb25_seenophy(unicode_lb_info_t, uint8_t);
+static int end_lb25_seenophy(unicode_lb_info_t);
+
+static int next_lb25_seennu(unicode_lb_info_t, uint8_t);
+
+static int next_lb25_seennuclcp(unicode_lb_info_t, uint8_t);
+
+/*
+** Put the handle into its initial state: default handlers installed and
+** both "previous class" slots holding the start-of-text marker.
+*/
+static void unicode_lb_reset(unicode_lb_info_t i)
+{
+       i->next_handler=next_def;
+       i->end_handler=end_def;
+
+       i->prevclass_nsp=UNICODE_LB_SOT;
+       i->prevclass=UNICODE_LB_SOT;
+}
+
+/*
+** Allocate a line-breaking handle.
+**
+** cb_func is invoked, together with cb_arg, with a UNICODE_LB_* value for
+** each character whose break status has been decided.
+**
+** Returns the new handle, to be released with unicode_lb_end(), or NULL if
+** memory could not be allocated.
+*/
+unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *),
+                                 void *cb_arg)
+{
+       unicode_lb_info_t i=calloc(1, sizeof(struct unicode_lb_info));
+
+       /*
+       ** Allocation failure must not be dereferenced; unicode_lbc_init()
+       ** already tests this function's result against NULL.
+       */
+       if (!i)
+               return NULL;
+
+       i->cb_func=cb_func;
+       i->cb_arg=cb_arg;
+
+       unicode_lb_reset(i);
+       return i;
+}
+
+/*
+** Finish a pass: run whichever end-of-input handler is current, free the
+** handle, and hand back the handler's result.
+*/
+int unicode_lb_end(unicode_lb_info_t i)
+{
+       int (*finish)(struct unicode_lb_info *)=i->end_handler;
+       int status=finish(i);
+
+       free(i);
+
+       return status;
+}
+
+/* Replace the handle's option bits with "opts". */
+void unicode_lb_set_opts(unicode_lb_info_t i, int opts)
+{
+       i->opts = opts;
+}
+
+/* End-of-input handler for the default state: nothing is pending. */
+
+static int end_def(unicode_lb_info_t i)
+{
+       (void)i; /* LB3 N/A */
+
+       return 0;
+}
+#define RESULT(x) (*i->cb_func)((x), i->cb_arg) /* Report verdict x through the callback of the handle named i in the enclosing scope; yields the callback's return value */
+
+/*
+** Feed "cnt" characters from "chars" to unicode_lb_next(), in order.
+**
+** Stops at, and returns, the first non-zero result; returns 0 when every
+** character was accepted (including when cnt is 0).
+*/
+int unicode_lb_next_cnt(unicode_lb_info_t i,
+                       const unicode_char *chars,
+                       size_t cnt)
+{
+       size_t n;
+
+       for (n=0; n < cnt; ++n)
+       {
+               int status=unicode_lb_next(i, chars[n]);
+
+               if (status != 0)
+                       return status;
+       }
+
+       return 0;
+}
+
+/*
+** Map "ch" to its linebreak class using the generated lookup tables.
+**
+** UNICODE_LB_AL is passed as the last argument of unicode_tab_lookup();
+** presumably that is the value used for characters the tables do not list
+** (XX resolved to AL, LB1) -- confirm against unicode_tab_lookup().
+*/
+int unicode_lb_lookup(unicode_char ch)
+{
+       const size_t index_cnt=
+               sizeof(unicode_indextab) / sizeof(unicode_indextab[0]);
+
+       return unicode_tab_lookup(ch,
+                                 unicode_indextab, index_cnt,
+                                 unicode_rangetab,
+                                 unicode_classtab,
+                                 UNICODE_LB_AL /* XX, LB1 */);
+}
+
+/*
+** Process one character: determine its linebreak class and pass the class
+** to the current state handler, returning the handler's result.
+**
+** With UNICODE_LB_OPT_DASHWJ set, U+2012 and U+2013 are given class WJ
+** instead of their table class.
+*/
+int unicode_lb_next(unicode_lb_info_t i,
+                   unicode_char ch)
+{
+       uint8_t uclass;
+
+       if ((i->opts & UNICODE_LB_OPT_DASHWJ) &&
+           (ch == 0x2012 || ch == 0x2013))
+               uclass=UNICODE_LB_WJ;
+       else
+               uclass=unicode_lb_lookup(ch);
+
+       return (*i->next_handler)(i, uclass);
+}
+
+static int next_def_nolb25(unicode_lb_info_t i,
+                          uint8_t uclass,
+                          int nolb25);
+
+/*
+** Handler for the default state: apply the ordinary rules to the next
+** character's class, with the LB25 lookahead enabled.
+*/
+static int next_def(unicode_lb_info_t i,
+                   uint8_t uclass)
+{
+       const int nolb25=0;
+
+       return next_def_nolb25(i, uclass, nolb25);
+}
+
+static int next_def_nolb25(unicode_lb_info_t i,
+                          uint8_t uclass,
+
+                          /*
+                          ** Flag -- recursively invoked after discarding LB25.
+                          ** When non-zero, the checks below that are keyed on
+                          ** a preceding PR or PO (including the OP|HY
+                          ** lookahead) are skipped.
+                          */
+                          int nolb25)
+{
+
+       /*
+       ** Decide the break opportunity in front of a character of class
+       ** "uclass", given the classes recorded for what preceded it. The rules
+       ** are tried in order and the first match reports its verdict through
+       ** RESULT(); the function returns the callback's return value. The one
+       ** exception is the LB25 lookahead, which returns 0 without invoking the
+       ** callback.
+       **
+       ** Retrieve the previous unicode character's linebreak class.
+       */
+
+       uint8_t prevclass=i->prevclass;
+       uint8_t prevclass_nsp=i->prevclass_nsp;
+
+       /*
+       ** Save this unicode char's linebreak class, for the next goaround.
+       ** Some rules below overwrite or restore these two fields.
+       */
+       i->prevclass=uclass;
+
+       if (uclass != UNICODE_LB_SP)
+               i->prevclass_nsp=uclass;
+
+       if (uclass == UNICODE_LB_NU)
+               i->next_handler=next_lb25_seennu; /* LB25: the following characters go to the numeric-context handler */
+
+       if (prevclass == UNICODE_LB_SOT)
+       {
+               if (uclass == UNICODE_LB_CM) /* LB9: a CM with nothing before it is recorded as AL */
+                       i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
+
+               return RESULT(UNICODE_LB_NONE); /* LB2: no break at the start of text */
+       }
+
+       if (prevclass == UNICODE_LB_CR && uclass == UNICODE_LB_LF)
+               return RESULT(UNICODE_LB_NONE); /* LB5: CR LF stays together */
+
+       switch (prevclass) {
+       case UNICODE_LB_BK:
+       case UNICODE_LB_CR:
+       case UNICODE_LB_LF:
+       case UNICODE_LB_NL:
+
+               if (uclass == UNICODE_LB_CM)
+               {
+                       i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
+                       /* LB9: a CM right after a hard break is recorded as AL */
+               }
+
+               return RESULT(UNICODE_LB_MANDATORY); /* LB4, LB5: mandatory break after BK, CR, LF, NL */
+
+       case UNICODE_LB_SP:
+       case UNICODE_LB_ZW:
+               if (uclass == UNICODE_LB_CM)
+                       i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
+               /* LB10: a CM after SP or ZW is handled as AL from here on */
+               break;
+       default:
+               break;
+       }
+
+       switch (uclass) {
+
+               /* LB6: no break in front of a hard line break */
+       case UNICODE_LB_BK:
+       case UNICODE_LB_CR:
+       case UNICODE_LB_LF:
+       case UNICODE_LB_NL:
+
+               /* LB7: no break in front of SP or ZW */
+       case UNICODE_LB_SP:
+       case UNICODE_LB_ZW:
+
+               return RESULT(UNICODE_LB_NONE);
+       default:
+               break;
+       }
+
+       if (prevclass_nsp == UNICODE_LB_ZW)
+               return RESULT(UNICODE_LB_ALLOWED); /* LB8: break after ZW, also when spaces intervene */
+
+       if (uclass == UNICODE_LB_CM)
+       {
+               i->prevclass=prevclass;
+               i->prevclass_nsp=prevclass_nsp;
+               return RESULT(UNICODE_LB_NONE); /* LB9: the CM is transparent; the classes saved on entry are put back */
+       }
+
+       if (prevclass == UNICODE_LB_WJ || uclass == UNICODE_LB_WJ)
+               return RESULT(UNICODE_LB_NONE); /* LB11: no break before or after WJ */
+
+       if (prevclass == UNICODE_LB_GL)
+               return RESULT(UNICODE_LB_NONE); /* LB12: no break after GL */
+
+       if (uclass == UNICODE_LB_GL &&
+           prevclass != UNICODE_LB_SP &&
+           prevclass != UNICODE_LB_BA &&
+           prevclass != UNICODE_LB_HY)
+               return RESULT(UNICODE_LB_NONE); /* LB12a: no break before GL, unless preceded by SP, BA or HY */
+
+
+       switch (uclass) {
+       case UNICODE_LB_SY:
+               if (i->opts & UNICODE_LB_OPT_SYBREAK)
+               {
+                       if (prevclass == UNICODE_LB_SP)
+                               return RESULT(UNICODE_LB_ALLOWED);
+               } /* Otherwise SY deliberately falls through to LB13 */
+
+       case UNICODE_LB_CL:
+       case UNICODE_LB_CP:
+       case UNICODE_LB_EX:
+       case UNICODE_LB_IS:
+               return RESULT(UNICODE_LB_NONE); /* LB13: no break before CL, CP, EX, IS, SY */
+       default:
+               break;
+       }
+
+       if ((i->opts & UNICODE_LB_OPT_SYBREAK) && prevclass == UNICODE_LB_SY)
+               switch (uclass) { /* SYBREAK option: no break after SY when EX, AL or ID follows */
+               case UNICODE_LB_EX:
+               case UNICODE_LB_AL:
+               case UNICODE_LB_ID:
+                       return RESULT(UNICODE_LB_NONE);
+               }
+
+       if (prevclass_nsp == UNICODE_LB_OP)
+               return RESULT(UNICODE_LB_NONE); /* LB14: no break after OP, also across spaces */
+
+       if (prevclass_nsp == UNICODE_LB_QU && uclass == UNICODE_LB_OP)
+               return RESULT(UNICODE_LB_NONE); /* LB15: QU SP* OP */
+
+       if ((prevclass_nsp == UNICODE_LB_CL || prevclass_nsp == UNICODE_LB_CP)
+           && uclass == UNICODE_LB_NS)
+               return RESULT(UNICODE_LB_NONE); /* LB16: (CL|CP) SP* NS */
+
+       if (prevclass_nsp == UNICODE_LB_B2 && uclass == UNICODE_LB_B2)
+               return RESULT(UNICODE_LB_NONE); /* LB17: B2 SP* B2 */
+
+       if (prevclass == UNICODE_LB_SP)
+               return RESULT(UNICODE_LB_ALLOWED); /* LB18: break after spaces */
+
+       if (uclass == UNICODE_LB_QU || prevclass == UNICODE_LB_QU)
+               return RESULT(UNICODE_LB_NONE); /* LB19: no break before or after QU */
+
+       if (uclass == UNICODE_LB_CB || prevclass == UNICODE_LB_CB)
+               return RESULT(UNICODE_LB_ALLOWED); /* LB20: break before and after CB */
+
+       /* LB21: no break before BA, HY, NS; no break after BB */
+
+       switch (uclass) {
+       case UNICODE_LB_BA:
+       case UNICODE_LB_HY:
+       case UNICODE_LB_NS:
+               return RESULT(UNICODE_LB_NONE);
+       default:
+               break;
+       }
+
+       if (prevclass == UNICODE_LB_BB)
+               return RESULT(UNICODE_LB_NONE);
+
+       if (uclass == UNICODE_LB_IN)
+               switch (prevclass) {
+               case UNICODE_LB_AL:
+               case UNICODE_LB_ID:
+               case UNICODE_LB_IN:
+               case UNICODE_LB_NU:
+                       return RESULT(UNICODE_LB_NONE); /* LB22: no break before IN after AL, ID, IN, NU */
+               default:
+                       break;
+               }
+
+
+       if (prevclass == UNICODE_LB_ID && uclass == UNICODE_LB_PO)
+               return RESULT(UNICODE_LB_NONE); /* LB23: ID PO */
+       if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_NU)
+               return RESULT(UNICODE_LB_NONE); /* LB23: AL NU */
+
+       if (prevclass == UNICODE_LB_NU && uclass == UNICODE_LB_AL)
+               return RESULT(UNICODE_LB_NONE); /* LB23: NU AL */
+
+
+       if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_ID)
+               return RESULT(UNICODE_LB_NONE); /* LB24: PR ID */
+       if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_AL)
+               return RESULT(UNICODE_LB_NONE); /* LB24: PR AL */
+       if (prevclass == UNICODE_LB_PO && uclass == UNICODE_LB_AL)
+               return RESULT(UNICODE_LB_NONE); /* LB24: PO AL */
+
+       if ((i->opts & UNICODE_LB_OPT_PRBREAK) && uclass == UNICODE_LB_PR)
+               switch (prevclass) { /* PRBREAK option: no break before PR after PR, AL or ID */
+               case UNICODE_LB_PR:
+               case UNICODE_LB_AL:
+               case UNICODE_LB_ID:
+                       return RESULT(UNICODE_LB_NONE);
+               }
+               
+       if (!nolb25 &&
+           (prevclass == UNICODE_LB_PR || prevclass == UNICODE_LB_PO))
+       {
+               if (uclass == UNICODE_LB_NU)
+                       return RESULT(UNICODE_LB_NONE); /* LB25: (PR|PO) NU */
+
+               if (uclass == UNICODE_LB_OP || uclass == UNICODE_LB_HY)
+               {
+                       i->prevclass=prevclass; /* Undo the update made on entry: the PR|PO stays "previous" */
+                       i->prevclass_nsp=prevclass_nsp;
+
+                       i->savedclass=uclass; /* Remember the withheld OP|HY */
+                       i->savedcmcnt=0;
+                       i->next_handler=next_lb25_seenophy;
+                       i->end_handler=end_lb25_seenophy;
+                       return 0; /* No verdict reported yet; the lookahead handlers decide */
+               }
+       }
+
+       if ((prevclass == UNICODE_LB_OP || prevclass == UNICODE_LB_HY) &&
+           uclass == UNICODE_LB_NU)
+               return RESULT(UNICODE_LB_NONE); /* LB25: (OP|HY) NU */
+
+       /*****/
+
+       if (prevclass == UNICODE_LB_JL)
+               switch (uclass) {
+               case UNICODE_LB_JL:
+               case UNICODE_LB_JV:
+               case UNICODE_LB_H2:
+               case UNICODE_LB_H3:
+                       return RESULT(UNICODE_LB_NONE); /* LB26: JL followed by JL, JV, H2, H3 */
+               default:
+                       break;
+               }
+
+       if ((prevclass == UNICODE_LB_JV ||
+            prevclass == UNICODE_LB_H2) &&
+           (uclass == UNICODE_LB_JV ||
+            uclass == UNICODE_LB_JT))
+               return RESULT(UNICODE_LB_NONE); /* LB26: (JV|H2) followed by JV, JT */
+
+       if ((prevclass == UNICODE_LB_JT ||
+            prevclass == UNICODE_LB_H3) &&
+           uclass == UNICODE_LB_JT)
+               return RESULT(UNICODE_LB_NONE); /* LB26: (JT|H3) followed by JT */
+
+
+       switch (prevclass) {
+       case UNICODE_LB_JL:
+       case UNICODE_LB_JV:
+       case UNICODE_LB_JT:
+       case UNICODE_LB_H2:
+       case UNICODE_LB_H3:
+               if (uclass == UNICODE_LB_IN || uclass == UNICODE_LB_PO)
+                       return RESULT(UNICODE_LB_NONE); /* LB27: no break before IN or PO after these classes */
+       default:
+               break;
+       }
+
+       switch (uclass) {
+       case UNICODE_LB_JL:
+       case UNICODE_LB_JV:
+       case UNICODE_LB_JT:
+       case UNICODE_LB_H2:
+       case UNICODE_LB_H3:
+               if (prevclass == UNICODE_LB_PR)
+                       return RESULT(UNICODE_LB_NONE); /* LB27: no break after PR before these classes */
+       default:
+               break;
+       }
+
+       if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_AL)
+               return RESULT(UNICODE_LB_NONE); /* LB28: AL AL */
+
+       if (prevclass == UNICODE_LB_IS && uclass == UNICODE_LB_AL)
+               return RESULT(UNICODE_LB_NONE); /* LB29: IS AL */
+
+       if ((prevclass == UNICODE_LB_AL || prevclass == UNICODE_LB_NU) &&
+           uclass == UNICODE_LB_OP)
+               return RESULT(UNICODE_LB_NONE); /* LB30: (AL|NU) OP */
+
+       if ((uclass == UNICODE_LB_AL || uclass == UNICODE_LB_NU) &&
+           prevclass == UNICODE_LB_CP)
+               return RESULT(UNICODE_LB_NONE); /* LB30: CP (AL|NU) */
+
+       return RESULT(UNICODE_LB_ALLOWED); /* LB31: break is allowed everywhere else */
+}
+
+/*
+** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second
+** character, but NU did not follow. Backtrack.
+**
+** Restores the default handlers, then runs the default rules once for the
+** withheld character plus once for every CM counted in i->savedcmcnt, so
+** that each withheld character gets its verdict reported.
+**
+** Returns the first non-zero value from next_def_nolb25(), otherwise 0.
+*/
+
+static int unwind_lb25_seenophy(unicode_lb_info_t i)
+{
+       int rc;
+
+       /*uint8_t class=i->savedclass;*/
+       int nolb25_flag=1; /* The first replay must not re-enter the LB25 lookahead */
+
+       i->next_handler=next_def;
+       i->end_handler=end_def;
+
+       do
+       {
+               rc=next_def_nolb25(i, i->savedclass, nolb25_flag); /* i->savedclass is passed on every pass, including the passes that stand for CMs */
+
+               if (rc)
+                       return rc;
+
+               /*class=UNICODE_LB_CM;*/
+               nolb25_flag=0;
+       } while (i->savedcmcnt--); /* Body runs savedcmcnt+1 times; the counter wraps after the final test and is set to 0 again when the lookahead is next entered */
+       return 0;
+}
+
+/*
+** Handler in effect after (PR|PO)(OP|HY), while the verdict for the OP|HY
+** is still being withheld.
+**
+** A CM is counted and withheld as well. A NU completes the modified LB25
+** regexp: every withheld character, and the NU, is reported as "no break".
+** Anything else unwinds the lookahead and is then processed normally.
+*/
+
+static int next_lb25_seenophy(unicode_lb_info_t i,
+                             uint8_t uclass)
+{
+       int status;
+
+       switch (uclass) {
+       case UNICODE_LB_CM:
+               /* Keep track of CMs, and try again */
+               i->savedcmcnt += 1;
+               return 0;
+
+       case UNICODE_LB_NU:
+               break;
+
+       default:
+               /* Not a number after all: replay what was withheld */
+               status=unwind_lb25_seenophy(i);
+
+               if (status)
+                       return status;
+
+               return next_def_nolb25(i, uclass, 0);
+       }
+
+       /* (OP|HY) feedback, followed by one report per withheld CM */
+       for (;;)
+       {
+               status=RESULT(UNICODE_LB_NONE);
+
+               if (status)
+                       return status;
+
+               if (i->savedcmcnt-- == 0)
+                       break;
+       }
+
+       i->next_handler=next_lb25_seennu;
+       i->end_handler=end_def;
+
+       i->prevclass_nsp=uclass;
+       i->prevclass=uclass;
+
+       return RESULT(UNICODE_LB_NONE); /* The NU itself */
+}
+
+/*
+** End of input arrived while (PR|PO)(OP|HY) was still pending: unwind the
+** lookahead first, then finish like the default state does.
+*/
+
+static int end_lb25_seenophy(unicode_lb_info_t i)
+{
+       int status=unwind_lb25_seenophy(i);
+
+       if (status != 0)
+               return status;
+
+       return end_def(i);
+}
+
+/*
+** Handler in effect after an NU (modified LB25 regexp).
+**
+** NU, SY, IS and CM keep this state; CL or CP moves on to the closing
+** state; PR or PO ends the numeric context without a break. Any other class
+** leaves the numeric context and goes through the default rules.
+*/
+static int next_lb25_seennu(unicode_lb_info_t i, uint8_t uclass)
+{
+       switch (uclass) {
+       case UNICODE_LB_NU:
+       case UNICODE_LB_SY:
+       case UNICODE_LB_IS:
+               i->prevclass_nsp=uclass;
+               i->prevclass=uclass;
+               return RESULT(UNICODE_LB_NONE);
+
+       case UNICODE_LB_CM:
+               return RESULT(UNICODE_LB_NONE); /* LB9 */
+
+       case UNICODE_LB_CL:
+       case UNICODE_LB_CP:
+               i->prevclass_nsp=uclass;
+               i->prevclass=uclass;
+               i->next_handler=next_lb25_seennuclcp;
+               i->end_handler=end_def;
+               return RESULT(UNICODE_LB_NONE);
+
+       default:
+               break;
+       }
+
+       /* The numeric context ends with this character */
+       i->next_handler=next_def;
+       i->end_handler=end_def;
+
+       if (uclass != UNICODE_LB_PR && uclass != UNICODE_LB_PO)
+               return next_def(i, uclass); /* Not a prefix, process normally */
+
+       i->prevclass_nsp=uclass;
+       i->prevclass=uclass;
+
+       return RESULT(UNICODE_LB_NONE);
+}
+
+/*
+** Handler in effect after NU ... (CL|CP) in the modified LB25 regexp.
+**
+** A CM keeps this state. Any other class returns to the default state: PR
+** or PO is reported as "no break", everything else goes through the
+** default rules.
+*/
+static int next_lb25_seennuclcp(unicode_lb_info_t i, uint8_t uclass)
+{
+       int is_postfix;
+
+       if (uclass == UNICODE_LB_CM)
+               return RESULT(UNICODE_LB_NONE); /* LB9 */
+
+       i->next_handler=next_def;
+       i->end_handler=end_def;
+
+       is_postfix=(uclass == UNICODE_LB_PR || uclass == UNICODE_LB_PO);
+
+       if (!is_postfix)
+               return next_def(i, uclass);
+
+       i->prevclass_nsp=uclass;
+       i->prevclass=uclass;
+
+       return RESULT(UNICODE_LB_NONE);
+}
+
+/******************/
+
+struct unicode_lbc_info { /* Wrapper state: pairs each verdict with the character it belongs to */
+       unicode_lb_info_t handle; /* Underlying handle created by unicode_lb_init() */
+
+       struct unicode_buf buf; /* Characters appended by unicode_lbc_next() */
+
+       size_t buf_ptr; /* Index in buf of the next character to hand to cb_func */
+
+       int (*cb_func)(int, unicode_char, void *); /* Caller's callback: verdict, character, cb_arg */
+       void *cb_arg; /* Handed back to cb_func unchanged */
+};
+
+/*
+** Callback installed on the underlying handle. Takes the next buffered
+** character, advances the read position, and forwards the verdict together
+** with that character to the caller's callback.
+**
+** Returns -1 with errno set to EINVAL if no buffered character is left.
+*/
+static int unicode_lbc_callback(int value, void *ptr)
+{
+       unicode_lbc_info_t h=(unicode_lbc_info_t)ptr;
+       unicode_char ch;
+
+       if (h->buf_ptr >= unicode_buf_len(&h->buf))
+       {
+               errno=EINVAL;
+               return -1; /* Shouldn't happen */
+       }
+
+       ch=unicode_buf_ptr(&h->buf)[h->buf_ptr];
+       ++h->buf_ptr;
+
+       return (*h->cb_func)(value, ch, h->cb_arg);
+}
+
+/*
+** Create a wrapper handle whose callback receives the character along with
+** its verdict.
+**
+** Returns NULL if the wrapper or the underlying handle cannot be allocated.
+*/
+unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char, void *),
+                                   void *cb_arg)
+{
+       unicode_lbc_info_t h=calloc(1, sizeof(struct unicode_lbc_info));
+
+       if (h == NULL)
+               return NULL;
+
+       h->cb_func=cb_func;
+       h->cb_arg=cb_arg;
+
+       h->handle=unicode_lb_init(unicode_lbc_callback, h);
+
+       if (h->handle == NULL)
+       {
+               free(h);
+               return NULL;
+       }
+
+       unicode_buf_init(&h->buf, (size_t)-1);
+
+       return h;
+}
+
+/* Forward the option bits to the underlying handle. */
+void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts)
+{
+       unicode_lb_info_t inner=i->handle;
+
+       unicode_lb_set_opts(inner, opts);
+}
+       
+/*
+** Buffer "ch", then feed it to the underlying handle and return that
+** handle's result.
+**
+** Once every buffered character has been handed to the callback, the
+** buffer is emptied and the read position rewound before "ch" is added.
+**
+** NOTE(review): the result of unicode_buf_append() is not inspected here --
+** confirm whether it can fail.
+*/
+int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch)
+{
+       const int drained=(i->buf_ptr >= unicode_buf_len(&i->buf));
+
+       if (drained)
+       {
+               i->buf_ptr=0;
+               unicode_buf_clear(&i->buf);
+       }
+
+       unicode_buf_append(&i->buf, &ch, 1);
+
+       return unicode_lb_next(i->handle, ch);
+}
+
+/*
+** Finish the pass and destroy the wrapper. The underlying handle is ended
+** first, while the character buffer still exists; then the buffer and the
+** wrapper itself are released. Returns the result of unicode_lb_end().
+*/
+int unicode_lbc_end(unicode_lbc_info_t i)
+{
+       int status;
+
+       status=unicode_lb_end(i->handle);
+
+       unicode_buf_deinit(&i->buf);
+       free(i);
+
+       return status;
+}