2 ** Copyright 2011 Double Precision, Inc.
3 ** See COPYING for distribution information.
7 #include "unicode_config.h"
16 #include "wordbreaktab_internal.h"
17 #include "wordbreaktab.h"
19 struct unicode_wb_info
{
20 int (*cb_func
)(int, void *);
28 int (*next_handler
)(unicode_wb_info_t
, uint8_t);
29 int (*end_handler
)(unicode_wb_info_t
);
32 static int sot(unicode_wb_info_t i
, uint8_t cl
);
33 static int wb4(unicode_wb_info_t i
);
34 static int wb1and2_done(unicode_wb_info_t i
, uint8_t cl
);
36 static int seen_wb67_handler(unicode_wb_info_t i
, uint8_t cl
);
37 static int seen_wb67_end_handler(unicode_wb_info_t i
);
38 static int wb67_done(unicode_wb_info_t i
, uint8_t prevclass
, uint8_t cl
);
40 static int seen_wb1112_handler(unicode_wb_info_t i
, uint8_t cl
);
41 static int seen_wb1112_end_handler(unicode_wb_info_t i
);
42 static int wb1112_done(unicode_wb_info_t i
, uint8_t prevclass
, uint8_t cl
);
44 unicode_wb_info_t
unicode_wb_init(int (*cb_func
)(int, void *),
47 unicode_wb_info_t i
=calloc(1, sizeof(struct unicode_wb_info
));
58 int unicode_wb_end(unicode_wb_info_t i
)
63 rc
=(*i
->end_handler
)(i
);
71 int unicode_wb_next_cnt(unicode_wb_info_t i
,
72 const unicode_char
*chars
,
79 rc
=unicode_wb_next(i
, *chars
++);
87 int unicode_wb_next(unicode_wb_info_t i
, unicode_char ch
)
89 return (*i
->next_handler
)
90 (i
, unicode_tab_lookup(ch
,
92 sizeof(unicode_indextab
)
93 / sizeof(unicode_indextab
[0]),
99 static int wb4(unicode_wb_info_t i
)
103 while (i
->wb4_cnt
> 0)
108 rc
=(*i
->cb_func
)(0, i
->cb_arg
);
113 static int result(unicode_wb_info_t i
, int flag
)
118 rc
=(*i
->cb_func
)(flag
, i
->cb_arg
);
123 #define SET_HANDLER(next,end) (i->next_handler=next, i->end_handler=end)
125 static int sot(unicode_wb_info_t i
, uint8_t cl
)
128 SET_HANDLER(wb1and2_done
, NULL
);
130 return result(i
, 1); /* WB1 */
133 static int wb1and2_done(unicode_wb_info_t i
, uint8_t cl
)
135 uint8_t prevclass
=i
->prevclass
;
139 if (prevclass
== UNICODE_WB_CR
&& cl
== UNICODE_WB_LF
)
140 return result(i
, 0); /* WB3 */
145 case UNICODE_WB_Newline
:
146 return result(i
, 1); /* WB3a */
152 case UNICODE_WB_Newline
:
153 return result(i
, 1); /* WB3b */
156 if (cl
== UNICODE_WB_Extend
|| cl
== UNICODE_WB_Format
)
158 i
->prevclass
=prevclass
;
163 if (prevclass
== UNICODE_WB_ALetter
&& cl
== UNICODE_WB_ALetter
)
165 return result(i
, 0); /* WB5 */
168 if (prevclass
== UNICODE_WB_ALetter
&&
169 (cl
== UNICODE_WB_MidLetter
|| cl
== UNICODE_WB_MidNumLet
))
172 SET_HANDLER(seen_wb67_handler
, seen_wb67_end_handler
);
176 return wb67_done(i
, prevclass
, cl
);
180 ** ALetter (MidLetter | MidNumLet ) ?
184 ** Seen ALetter (MidLetter | MidNumLet), with the second character's status
188 static int seen_wb67_handler(unicode_wb_info_t i
, uint8_t cl
)
194 if (cl
== UNICODE_WB_Extend
|| cl
== UNICODE_WB_Format
)
200 extra_cnt
=i
->wb4_extra_cnt
;
203 ** Reset the handler to the default, then check WB6
206 SET_HANDLER(wb1and2_done
, NULL
);
208 if (cl
== UNICODE_WB_ALetter
)
210 rc
=result(i
, 0); /* WB6 */
211 i
->wb4_cnt
=extra_cnt
;
214 rc
=result(i
, 0); /* WB7 */
221 prevclass
=i
->prevclass
; /* This was the second character */
224 ** Process the second character, starting with WB7
227 rc
=wb67_done(i
, UNICODE_WB_ALetter
, prevclass
);
229 i
->prevclass
=prevclass
;
230 i
->wb4_cnt
=extra_cnt
;
233 rc
=(*i
->next_handler
)(i
, cl
);
234 /* Process the current char now */
240 ** Seen ALetter (MidLetter | MidNumLet), with the second character's status
241 ** not returned yet, and now at end of text.
244 static int seen_wb67_end_handler(unicode_wb_info_t i
)
247 size_t extra_cnt
=i
->wb4_extra_cnt
;
250 ** Process the second character, starting with WB7.
253 rc
=wb67_done(i
, UNICODE_WB_ALetter
, i
->prevclass
);
254 i
->wb4_cnt
=extra_cnt
;
261 static int wb67_done(unicode_wb_info_t i
, uint8_t prevclass
, uint8_t cl
)
263 if (prevclass
== UNICODE_WB_Numeric
&& cl
== UNICODE_WB_Numeric
)
264 return result(i
, 0); /* WB8 */
266 if (prevclass
== UNICODE_WB_ALetter
&& cl
== UNICODE_WB_Numeric
)
267 return result(i
, 0); /* WB9 */
269 if (prevclass
== UNICODE_WB_Numeric
&& cl
== UNICODE_WB_ALetter
)
270 return result(i
, 0); /* WB10 */
273 if (prevclass
== UNICODE_WB_Numeric
&&
274 (cl
== UNICODE_WB_MidNum
|| cl
== UNICODE_WB_MidNumLet
))
277 SET_HANDLER(seen_wb1112_handler
, seen_wb1112_end_handler
);
281 return wb1112_done(i
, prevclass
, cl
);
285 ** Numeric (MidNum | MidNumLet ) ?
289 ** Seen Numeric (MidNum | MidNumLet), with the second character's status
293 static int seen_wb1112_handler(unicode_wb_info_t i
, uint8_t cl
)
299 if (cl
== UNICODE_WB_Extend
|| cl
== UNICODE_WB_Format
)
305 extra_cnt
=i
->wb4_extra_cnt
;
308 ** Reset the handler to the default, then check WB11
311 SET_HANDLER(wb1and2_done
, NULL
);
313 if (cl
== UNICODE_WB_Numeric
)
315 rc
=result(i
, 0); /* WB11 */
316 i
->wb4_cnt
=extra_cnt
;
319 rc
=result(i
, 0); /* WB12 */
326 prevclass
=i
->prevclass
; /* This was the second character */
329 ** Process the second character, starting with WB11
332 rc
=wb1112_done(i
, UNICODE_WB_Numeric
, prevclass
);
334 i
->prevclass
=prevclass
;
335 i
->wb4_cnt
=extra_cnt
;
338 rc
=(*i
->next_handler
)(i
, cl
);
339 /* Process the current char now */
345 ** Seen Numeric (MidNum | MidNumLet), with the second character's status
346 ** not returned yet, and now at end of text.
349 static int seen_wb1112_end_handler(unicode_wb_info_t i
)
352 size_t extra_cnt
=i
->wb4_extra_cnt
;
355 ** Process the second character, starting with WB11.
358 rc
=wb1112_done(i
, UNICODE_WB_Numeric
, i
->prevclass
);
359 i
->wb4_cnt
=extra_cnt
;
365 static int wb1112_done(unicode_wb_info_t i
, uint8_t prevclass
, uint8_t cl
)
367 if (prevclass
== UNICODE_WB_Katakana
&&
368 cl
== UNICODE_WB_Katakana
)
369 return result(i
, 0); /* WB13 */
372 case UNICODE_WB_ALetter
:
373 case UNICODE_WB_Numeric
:
374 case UNICODE_WB_Katakana
:
375 case UNICODE_WB_ExtendNumLet
:
376 if (cl
== UNICODE_WB_ExtendNumLet
)
377 return result(i
, 0); /* WB13a */
380 if (prevclass
== UNICODE_WB_ExtendNumLet
)
382 case UNICODE_WB_ALetter
:
383 case UNICODE_WB_Numeric
:
384 case UNICODE_WB_Katakana
:
385 return result(i
, 0); /* WB13b */
388 return result(i
, 1); /* WB14 */
391 /* --------------------------------------------------------------------- */
393 struct unicode_wbscan_info
{
394 unicode_wb_info_t wb_handle
;
400 static int unicode_wbscan_callback(int, void *);
402 unicode_wbscan_info_t
unicode_wbscan_init()
404 unicode_wbscan_info_t i
=calloc(1, sizeof(struct unicode_wbscan_info
));
409 if ((i
->wb_handle
=unicode_wb_init(unicode_wbscan_callback
, i
)) == NULL
)
418 int unicode_wbscan_next(unicode_wbscan_info_t i
, unicode_char ch
)
421 unicode_wb_next(i
->wb_handle
, ch
);
426 size_t unicode_wbscan_end(unicode_wbscan_info_t i
)
430 unicode_wb_end(i
->wb_handle
);
437 static int unicode_wbscan_callback(int flag
, void *arg
)
439 unicode_wbscan_info_t i
=(unicode_wbscan_info_t
)arg
;
441 if (flag
&& i
->cnt
> 0)