2 ** Copyright 2011 Double Precision, Inc.
3 ** See COPYING for distribution information.
7 #include "unicode_config.h"
16 #include "linebreaktab_internal.h"
18 #include "linebreaktab.h"
/*
** Pseudo linebreak class: "start of text". unicode_lb_reset() stores this
** value as the previous character's class, before any character has been
** processed.
*/
#define UNICODE_LB_SOT 0xFF
22 struct unicode_lb_info
{
23 int (*cb_func
)(int, void *);
32 uint8_t prevclass_nsp
;
34 int (*next_handler
)(struct unicode_lb_info
*, uint8_t);
35 int (*end_handler
)(struct unicode_lb_info
*);
39 /* http://www.unicode.org/reports/tr14/#Algorithm */
41 static int next_def(unicode_lb_info_t
, uint8_t);
42 static int end_def(unicode_lb_info_t
);
44 static int next_lb25_seenophy(unicode_lb_info_t
, uint8_t);
45 static int end_lb25_seenophy(unicode_lb_info_t
);
47 static int next_lb25_seennu(unicode_lb_info_t
, uint8_t);
49 static int next_lb25_seennuclcp(unicode_lb_info_t
, uint8_t);
51 static void unicode_lb_reset(unicode_lb_info_t i
)
53 i
->prevclass
=i
->prevclass_nsp
=UNICODE_LB_SOT
;
54 i
->next_handler
=next_def
;
55 i
->end_handler
=end_def
;
58 unicode_lb_info_t
unicode_lb_init(int (*cb_func
)(int, void *),
61 unicode_lb_info_t i
=calloc(1, sizeof(struct unicode_lb_info
));
70 int unicode_lb_end(unicode_lb_info_t i
)
72 int rc
=(*i
->end_handler
)(i
);
78 void unicode_lb_set_opts(unicode_lb_info_t i
, int opts
)
83 /* Default end handler has nothing to do */
85 static int end_def(unicode_lb_info_t i
)
/*
** Report a linebreak value to the handle's callback, passing the callback
** argument along. Expands to the callback's return value.
**
** NOTE: relies on a handle named "i" being in scope where it is used.
*/
#define RESULT(x) (i->cb_func((x), i->cb_arg))
92 int unicode_lb_next_cnt(unicode_lb_info_t i
,
93 const unicode_char
*chars
,
98 int rc
=unicode_lb_next(i
, *chars
);
109 int unicode_lb_lookup(unicode_char ch
)
111 return unicode_tab_lookup(ch
,
113 sizeof(unicode_indextab
)
114 / sizeof(unicode_indextab
[0]),
117 UNICODE_LB_AL
/* XX, LB1 */);
120 int unicode_lb_next(unicode_lb_info_t i
,
123 return (*i
->next_handler
)(i
, (i
->opts
& UNICODE_LB_OPT_DASHWJ
) &&
124 (ch
== 0x2012 || ch
== 0x2013)
125 ? UNICODE_LB_WJ
:unicode_lb_lookup(ch
));
128 static int next_def_nolb25(unicode_lb_info_t i
,
133 ** Default logic for next unicode char.
135 static int next_def(unicode_lb_info_t i
,
138 return next_def_nolb25(i
, uclass
, 0);
141 static int next_def_nolb25(unicode_lb_info_t i
,
144 /* Flag -- recursively invoked after discarding LB25 */
148 /* Retrieve the previous unicode character's linebreak class. */
150 uint8_t prevclass
=i
->prevclass
;
151 uint8_t prevclass_nsp
=i
->prevclass_nsp
;
153 /* Save this unicode char's linebreak class, for the next goaround */
156 if (uclass
!= UNICODE_LB_SP
)
157 i
->prevclass_nsp
=uclass
;
159 if (uclass
== UNICODE_LB_NU
)
160 i
->next_handler
=next_lb25_seennu
; /* LB25 */
162 if (prevclass
== UNICODE_LB_SOT
)
164 if (uclass
== UNICODE_LB_CM
) /* LB9 */
165 i
->prevclass
=i
->prevclass_nsp
=uclass
=UNICODE_LB_AL
;
167 return RESULT(UNICODE_LB_NONE
); /* LB2 */
170 if (prevclass
== UNICODE_LB_CR
&& uclass
== UNICODE_LB_LF
)
171 return RESULT(UNICODE_LB_NONE
); /* LB5 */
179 if (uclass
== UNICODE_LB_CM
)
181 i
->prevclass
=i
->prevclass_nsp
=uclass
=UNICODE_LB_AL
;
185 return RESULT(UNICODE_LB_MANDATORY
); /* LB4, LB5 */
189 if (uclass
== UNICODE_LB_CM
)
190 i
->prevclass
=i
->prevclass_nsp
=uclass
=UNICODE_LB_AL
;
209 return RESULT(UNICODE_LB_NONE
);
214 if (prevclass_nsp
== UNICODE_LB_ZW
)
215 return RESULT(UNICODE_LB_ALLOWED
); /* LB8 */
217 if (uclass
== UNICODE_LB_CM
)
219 i
->prevclass
=prevclass
;
220 i
->prevclass_nsp
=prevclass_nsp
;
221 return RESULT(UNICODE_LB_NONE
); /* LB9 */
224 if (prevclass
== UNICODE_LB_WJ
|| uclass
== UNICODE_LB_WJ
)
225 return RESULT(UNICODE_LB_NONE
); /* LB11 */
227 if (prevclass
== UNICODE_LB_GL
)
228 return RESULT(UNICODE_LB_NONE
); /* LB12 */
230 if (uclass
== UNICODE_LB_GL
&&
231 prevclass
!= UNICODE_LB_SP
&&
232 prevclass
!= UNICODE_LB_BA
&&
233 prevclass
!= UNICODE_LB_HY
)
234 return RESULT(UNICODE_LB_NONE
); /* LB12a */
239 if (i
->opts
& UNICODE_LB_OPT_SYBREAK
)
241 if (prevclass
== UNICODE_LB_SP
)
242 return RESULT(UNICODE_LB_ALLOWED
);
249 return RESULT(UNICODE_LB_NONE
); /* LB13 */
254 if ((i
->opts
& UNICODE_LB_OPT_SYBREAK
) && prevclass
== UNICODE_LB_SY
)
259 return RESULT(UNICODE_LB_NONE
);
262 if (prevclass_nsp
== UNICODE_LB_OP
)
263 return RESULT(UNICODE_LB_NONE
); /* LB14 */
265 if (prevclass_nsp
== UNICODE_LB_QU
&& uclass
== UNICODE_LB_OP
)
266 return RESULT(UNICODE_LB_NONE
); /* LB15 */
268 if ((prevclass_nsp
== UNICODE_LB_CL
|| prevclass_nsp
== UNICODE_LB_CP
)
269 && uclass
== UNICODE_LB_NS
)
270 return RESULT(UNICODE_LB_NONE
); /* LB16 */
272 if (prevclass_nsp
== UNICODE_LB_B2
&& uclass
== UNICODE_LB_B2
)
273 return RESULT(UNICODE_LB_NONE
); /* LB17 */
275 if (prevclass
== UNICODE_LB_SP
)
276 return RESULT(UNICODE_LB_ALLOWED
); /* LB18 */
278 if (uclass
== UNICODE_LB_QU
|| prevclass
== UNICODE_LB_QU
)
279 return RESULT(UNICODE_LB_NONE
); /* LB19 */
281 if (uclass
== UNICODE_LB_CB
|| prevclass
== UNICODE_LB_CB
)
282 return RESULT(UNICODE_LB_ALLOWED
); /* LB20 */
290 return RESULT(UNICODE_LB_NONE
);
295 if (prevclass
== UNICODE_LB_BB
)
296 return RESULT(UNICODE_LB_NONE
);
298 if (uclass
== UNICODE_LB_IN
)
304 return RESULT(UNICODE_LB_NONE
); /* LB22 */
310 if (prevclass
== UNICODE_LB_ID
&& uclass
== UNICODE_LB_PO
)
311 return RESULT(UNICODE_LB_NONE
); /* LB23 */
312 if (prevclass
== UNICODE_LB_AL
&& uclass
== UNICODE_LB_NU
)
313 return RESULT(UNICODE_LB_NONE
); /* LB23 */
315 if (prevclass
== UNICODE_LB_NU
&& uclass
== UNICODE_LB_AL
)
316 return RESULT(UNICODE_LB_NONE
); /* LB23 */
319 if (prevclass
== UNICODE_LB_PR
&& uclass
== UNICODE_LB_ID
)
320 return RESULT(UNICODE_LB_NONE
); /* LB24 */
321 if (prevclass
== UNICODE_LB_PR
&& uclass
== UNICODE_LB_AL
)
322 return RESULT(UNICODE_LB_NONE
); /* LB24 */
323 if (prevclass
== UNICODE_LB_PO
&& uclass
== UNICODE_LB_AL
)
324 return RESULT(UNICODE_LB_NONE
); /* LB24 */
326 if ((i
->opts
& UNICODE_LB_OPT_PRBREAK
) && uclass
== UNICODE_LB_PR
)
331 return RESULT(UNICODE_LB_NONE
);
335 (prevclass
== UNICODE_LB_PR
|| prevclass
== UNICODE_LB_PO
))
337 if (uclass
== UNICODE_LB_NU
)
338 return RESULT(UNICODE_LB_NONE
); /* LB25 */
340 if (uclass
== UNICODE_LB_OP
|| uclass
== UNICODE_LB_HY
)
342 i
->prevclass
=prevclass
;
343 i
->prevclass_nsp
=prevclass_nsp
;
345 i
->savedclass
=uclass
;
347 i
->next_handler
=next_lb25_seenophy
;
348 i
->end_handler
=end_lb25_seenophy
;
353 if ((prevclass
== UNICODE_LB_OP
|| prevclass
== UNICODE_LB_HY
) &&
354 uclass
== UNICODE_LB_NU
)
355 return RESULT(UNICODE_LB_NONE
); /* LB25 */
359 if (prevclass
== UNICODE_LB_JL
)
365 return RESULT(UNICODE_LB_NONE
); /* LB26 */
370 if ((prevclass
== UNICODE_LB_JV
||
371 prevclass
== UNICODE_LB_H2
) &&
372 (uclass
== UNICODE_LB_JV
||
373 uclass
== UNICODE_LB_JT
))
374 return RESULT(UNICODE_LB_NONE
); /* LB26 */
376 if ((prevclass
== UNICODE_LB_JT
||
377 prevclass
== UNICODE_LB_H3
) &&
378 uclass
== UNICODE_LB_JT
)
379 return RESULT(UNICODE_LB_NONE
); /* LB26 */
388 if (uclass
== UNICODE_LB_IN
|| uclass
== UNICODE_LB_PO
)
389 return RESULT(UNICODE_LB_NONE
); /* LB27 */
400 if (prevclass
== UNICODE_LB_PR
)
401 return RESULT(UNICODE_LB_NONE
); /* LB27 */
406 if (prevclass
== UNICODE_LB_AL
&& uclass
== UNICODE_LB_AL
)
407 return RESULT(UNICODE_LB_NONE
); /* LB28 */
409 if (prevclass
== UNICODE_LB_IS
&& uclass
== UNICODE_LB_AL
)
410 return RESULT(UNICODE_LB_NONE
); /* LB29 */
412 if ((prevclass
== UNICODE_LB_AL
|| prevclass
== UNICODE_LB_NU
) &&
413 uclass
== UNICODE_LB_OP
)
414 return RESULT(UNICODE_LB_NONE
); /* LB30 */
416 if ((uclass
== UNICODE_LB_AL
|| uclass
== UNICODE_LB_NU
) &&
417 prevclass
== UNICODE_LB_CP
)
418 return RESULT(UNICODE_LB_NONE
); /* LB30 */
420 return RESULT(UNICODE_LB_ALLOWED
); /* LB31 */
424 ** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second
425 ** character, but NU did not follow. Backtrack.
428 static int unwind_lb25_seenophy(unicode_lb_info_t i
)
432 /*uint8_t class=i->savedclass;*/
435 i
->next_handler
=next_def
;
436 i
->end_handler
=end_def
;
440 rc
=next_def_nolb25(i
, i
->savedclass
, nolb25_flag
);
445 /*class=UNICODE_LB_CM;*/
447 } while (i
->savedcmcnt
--);
452 ** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second
453 ** character. If there's now a NU, we found the modified LB25 regexp.
456 static int next_lb25_seenophy(unicode_lb_info_t i
,
461 if (uclass
== UNICODE_LB_CM
)
463 ++i
->savedcmcnt
; /* Keep track of CMs, and try again */
467 if (uclass
!= UNICODE_LB_NU
)
469 rc
=unwind_lb25_seenophy(i
);
474 return next_def_nolb25(i
, uclass
, 0);
479 rc
=RESULT(UNICODE_LB_NONE
); /* (OP|HY) feedback */
483 } while (i
->savedcmcnt
--);
485 i
->next_handler
=next_lb25_seennu
;
486 i
->end_handler
=end_def
;
487 i
->prevclass
=i
->prevclass_nsp
=uclass
;
488 return RESULT(UNICODE_LB_NONE
);
492 ** Seen (PR|PO)(OP|HY), and now The End. Unwind, and give up.
495 static int end_lb25_seenophy(unicode_lb_info_t i
)
497 int rc
=unwind_lb25_seenophy(i
);
505 ** Seen an NU, modified LB25 regexp.
507 static int next_lb25_seennu(unicode_lb_info_t i
, uint8_t uclass
)
509 if (uclass
== UNICODE_LB_NU
|| uclass
== UNICODE_LB_SY
||
510 uclass
== UNICODE_LB_IS
)
512 i
->prevclass
=i
->prevclass_nsp
=uclass
;
513 return RESULT(UNICODE_LB_NONE
);
516 if (uclass
== UNICODE_LB_CM
)
517 return RESULT(UNICODE_LB_NONE
); /* LB9 */
519 if (uclass
== UNICODE_LB_CL
|| uclass
== UNICODE_LB_CP
)
521 i
->prevclass
=i
->prevclass_nsp
=uclass
;
522 i
->next_handler
=next_lb25_seennuclcp
;
523 i
->end_handler
=end_def
;
524 return RESULT(UNICODE_LB_NONE
);
527 i
->next_handler
=next_def
;
528 i
->end_handler
=end_def
;
530 if (uclass
== UNICODE_LB_PR
|| uclass
== UNICODE_LB_PO
)
532 i
->prevclass
=i
->prevclass_nsp
=uclass
;
533 return RESULT(UNICODE_LB_NONE
);
536 return next_def(i
, uclass
); /* Not a prefix, process normally */
540 ** Seen CL|CP, in the modified LB25 regexp.
542 static int next_lb25_seennuclcp(unicode_lb_info_t i
, uint8_t uclass
)
544 if (uclass
== UNICODE_LB_CM
)
545 return RESULT(UNICODE_LB_NONE
); /* LB9 */
547 i
->next_handler
=next_def
;
548 i
->end_handler
=end_def
;
550 if (uclass
== UNICODE_LB_PR
|| uclass
== UNICODE_LB_PO
)
552 i
->prevclass
=i
->prevclass_nsp
=uclass
;
554 return RESULT(UNICODE_LB_NONE
);
557 return next_def(i
, uclass
);
562 struct unicode_lbc_info
{
563 unicode_lb_info_t handle
;
565 struct unicode_buf buf
;
569 int (*cb_func
)(int, unicode_char
, void *);
573 static int unicode_lbc_callback(int value
, void *ptr
)
575 unicode_lbc_info_t h
=(unicode_lbc_info_t
)ptr
;
577 if (h
->buf_ptr
>= unicode_buf_len(&h
->buf
))
580 return -1; /* Shouldn't happen */
583 return (*h
->cb_func
)(value
, unicode_buf_ptr(&h
->buf
)[h
->buf_ptr
++],
587 unicode_lbc_info_t
unicode_lbc_init(int (*cb_func
)(int, unicode_char
, void *),
590 unicode_lbc_info_t h
=
591 (unicode_lbc_info_t
)calloc(1, sizeof(struct unicode_lbc_info
));
599 if ((h
->handle
=unicode_lb_init(unicode_lbc_callback
, h
)) == NULL
)
604 unicode_buf_init(&h
->buf
, (size_t)-1);
608 void unicode_lbc_set_opts(unicode_lbc_info_t i
, int opts
)
610 unicode_lb_set_opts(i
->handle
, opts
);
613 int unicode_lbc_next(unicode_lbc_info_t i
, unicode_char ch
)
615 if (i
->buf_ptr
>= unicode_buf_len(&i
->buf
))
618 unicode_buf_clear(&i
->buf
);
621 unicode_buf_append(&i
->buf
, &ch
, 1);
622 return unicode_lb_next(i
->handle
, ch
);
625 int unicode_lbc_end(unicode_lbc_info_t i
)
627 int rc
=unicode_lb_end(i
->handle
);
629 unicode_buf_deinit(&i
->buf
);