Imported Upstream version 0.66.1
[hcoop/debian/courier-authlib.git] / libs / unicode / unicode_wordbreak.c
1 /*
2 ** Copyright 2011 Double Precision, Inc.
3 ** See COPYING for distribution information.
4 **
5 */
6
7 #include "unicode_config.h"
8 #include "unicode.h"
9
10 #include <unistd.h>
11 #include <stdint.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <errno.h>
15
16 #include "wordbreaktab_internal.h"
17 #include "wordbreaktab.h"
18
19 struct unicode_wb_info {
20 int (*cb_func)(int, void *);
21 void *cb_arg;
22
23 uint8_t prevclass;
24 size_t wb4_cnt;
25
26 size_t wb4_extra_cnt;
27
28 int (*next_handler)(unicode_wb_info_t, uint8_t);
29 int (*end_handler)(unicode_wb_info_t);
30 };
31
32 static int sot(unicode_wb_info_t i, uint8_t cl);
33 static int wb4(unicode_wb_info_t i);
34 static int wb1and2_done(unicode_wb_info_t i, uint8_t cl);
35
36 static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl);
37 static int seen_wb67_end_handler(unicode_wb_info_t i);
38 static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);
39
40 static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl);
41 static int seen_wb1112_end_handler(unicode_wb_info_t i);
42 static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);
43
44 unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *),
45 void *cb_arg)
46 {
47 unicode_wb_info_t i=calloc(1, sizeof(struct unicode_wb_info));
48
49 if (!i)
50 return NULL;
51
52 i->next_handler=sot;
53 i->cb_func=cb_func;
54 i->cb_arg=cb_arg;
55 return i;
56 }
57
58 int unicode_wb_end(unicode_wb_info_t i)
59 {
60 int rc;
61
62 if (i->end_handler)
63 rc=(*i->end_handler)(i);
64 else
65 rc=wb4(i);
66
67 free(i);
68 return rc;
69 }
70
71 int unicode_wb_next_cnt(unicode_wb_info_t i,
72 const unicode_char *chars,
73 size_t cnt)
74 {
75 int rc;
76
77 while (cnt)
78 {
79 rc=unicode_wb_next(i, *chars++);
80 --cnt;
81 if (rc)
82 return rc;
83 }
84 return 0;
85 }
86
87 int unicode_wb_next(unicode_wb_info_t i, unicode_char ch)
88 {
89 return (*i->next_handler)
90 (i, unicode_tab_lookup(ch,
91 unicode_indextab,
92 sizeof(unicode_indextab)
93 / sizeof(unicode_indextab[0]),
94 unicode_rangetab,
95 unicode_classtab,
96 UNICODE_WB_OTHER));
97 }
98
99 static int wb4(unicode_wb_info_t i)
100 {
101 int rc=0;
102
103 while (i->wb4_cnt > 0)
104 {
105 --i->wb4_cnt;
106
107 if (rc == 0)
108 rc=(*i->cb_func)(0, i->cb_arg);
109 }
110 return rc;
111 }
112
113 static int result(unicode_wb_info_t i, int flag)
114 {
115 int rc=wb4(i);
116
117 if (rc == 0)
118 rc=(*i->cb_func)(flag, i->cb_arg);
119
120 return rc;
121 }
122
/*
** Install the handler for the next character class together with the
** handler to run at the end of the input. Operates on the handle named
** "i" in the enclosing scope.
*/
#define SET_HANDLER(next,end) \
	((i)->next_handler=(next), (i)->end_handler=(end))
124
125 static int sot(unicode_wb_info_t i, uint8_t cl)
126 {
127 i->prevclass=cl;
128 SET_HANDLER(wb1and2_done, NULL);
129
130 return result(i, 1); /* WB1 */
131 }
132
133 static int wb1and2_done(unicode_wb_info_t i, uint8_t cl)
134 {
135 uint8_t prevclass=i->prevclass;
136
137 i->prevclass=cl;
138
139 if (prevclass == UNICODE_WB_CR && cl == UNICODE_WB_LF)
140 return result(i, 0); /* WB3 */
141
142 switch (prevclass) {
143 case UNICODE_WB_CR:
144 case UNICODE_WB_LF:
145 case UNICODE_WB_Newline:
146 return result(i, 1); /* WB3a */
147 }
148
149 switch (cl) {
150 case UNICODE_WB_CR:
151 case UNICODE_WB_LF:
152 case UNICODE_WB_Newline:
153 return result(i, 1); /* WB3b */
154 }
155
156 if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
157 {
158 i->prevclass=prevclass;
159 ++i->wb4_cnt;
160 return 0; /* WB4 */
161 }
162
163 if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_ALetter)
164 {
165 return result(i, 0); /* WB5 */
166 }
167
168 if (prevclass == UNICODE_WB_ALetter &&
169 (cl == UNICODE_WB_MidLetter || cl == UNICODE_WB_MidNumLet))
170 {
171 i->wb4_extra_cnt=0;
172 SET_HANDLER(seen_wb67_handler, seen_wb67_end_handler);
173 return 0;
174 }
175
176 return wb67_done(i, prevclass, cl);
177 }
178
179 /*
180 ** ALetter (MidLetter | MidNumLet ) ?
181 **
182 ** prevclass cl
183 **
184 ** Seen ALetter (MidLetter | MidNumLet), with the second character's status
185 ** not returned yet.
186 */
187
188 static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)
189 {
190 int rc;
191 uint8_t prevclass;
192 size_t extra_cnt;
193
194 if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
195 {
196 ++i->wb4_extra_cnt;
197 return 0;
198 }
199
200 extra_cnt=i->wb4_extra_cnt;
201
202 /*
203 ** Reset the handler to the default, then check WB6
204 */
205
206 SET_HANDLER(wb1and2_done, NULL);
207
208 if (cl == UNICODE_WB_ALetter)
209 {
210 rc=result(i, 0); /* WB6 */
211 i->wb4_cnt=extra_cnt;
212
213 if (rc == 0)
214 rc=result(i, 0); /* WB7 */
215
216 i->prevclass=cl;
217
218 return rc;
219 }
220
221 prevclass=i->prevclass; /* This was the second character */
222
223 /*
224 ** Process the second character, starting with WB7
225 */
226
227 rc=wb67_done(i, UNICODE_WB_ALetter, prevclass);
228
229 i->prevclass=prevclass;
230 i->wb4_cnt=extra_cnt;
231
232 if (rc == 0)
233 rc=(*i->next_handler)(i, cl);
234 /* Process the current char now */
235
236 return rc;
237 }
238
239 /*
240 ** Seen ALetter (MidLetter | MidNumLet), with the second character's status
241 ** not returned yet, and now sot.
242 */
243
244 static int seen_wb67_end_handler(unicode_wb_info_t i)
245 {
246 int rc;
247 size_t extra_cnt=i->wb4_extra_cnt;
248
249 /*
250 ** Process the second character, starting with WB7.
251 */
252
253 rc=wb67_done(i, UNICODE_WB_ALetter, i->prevclass);
254 i->wb4_cnt=extra_cnt;
255 if (rc == 0)
256 rc=wb4(i);
257 return rc;
258 }
259
260
261 static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
262 {
263 if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_Numeric)
264 return result(i, 0); /* WB8 */
265
266 if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_Numeric)
267 return result(i, 0); /* WB9 */
268
269 if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_ALetter)
270 return result(i, 0); /* WB10 */
271
272
273 if (prevclass == UNICODE_WB_Numeric &&
274 (cl == UNICODE_WB_MidNum || cl == UNICODE_WB_MidNumLet))
275 {
276 i->wb4_extra_cnt=0;
277 SET_HANDLER(seen_wb1112_handler, seen_wb1112_end_handler);
278 return 0;
279 }
280
281 return wb1112_done(i, prevclass, cl);
282 }
283
284 /*
285 ** Numeric (MidNum | MidNumLet ) ?
286 **
287 ** prevclass cl
288 **
289 ** Seen Numeric (MidNum | MidNumLet), with the second character's status
290 ** not returned yet.
291 */
292
293 static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl)
294 {
295 int rc;
296 uint8_t prevclass;
297 size_t extra_cnt;
298
299 if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
300 {
301 ++i->wb4_extra_cnt;
302 return 0;
303 }
304
305 extra_cnt=i->wb4_extra_cnt;
306
307 /*
308 ** Reset the handler to the default, then check WB6
309 */
310
311 SET_HANDLER(wb1and2_done, NULL);
312
313 if (cl == UNICODE_WB_Numeric)
314 {
315 rc=result(i, 0); /* WB11 */
316 i->wb4_cnt=extra_cnt;
317
318 if (rc == 0)
319 rc=result(i, 0); /* WB12 */
320
321 i->prevclass=cl;
322
323 return rc;
324 }
325
326 prevclass=i->prevclass; /* This was the second character */
327
328 /*
329 ** Process the second character, starting with WB7
330 */
331
332 rc=wb1112_done(i, UNICODE_WB_Numeric, prevclass);
333
334 i->prevclass=prevclass;
335 i->wb4_cnt=extra_cnt;
336
337 if (rc == 0)
338 rc=(*i->next_handler)(i, cl);
339 /* Process the current char now */
340
341 return rc;
342 }
343
344 /*
345 ** Seen Numeric (MidNum | MidNumLet), with the second character's status
346 ** not returned yet, and now sot.
347 */
348
349 static int seen_wb1112_end_handler(unicode_wb_info_t i)
350 {
351 int rc;
352 size_t extra_cnt=i->wb4_extra_cnt;
353
354 /*
355 ** Process the second character, starting with WB11.
356 */
357
358 rc=wb1112_done(i, UNICODE_WB_Numeric, i->prevclass);
359 i->wb4_cnt=extra_cnt;
360 if (rc == 0)
361 rc=wb4(i);
362 return rc;
363 }
364
365 static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
366 {
367 if (prevclass == UNICODE_WB_Katakana &&
368 cl == UNICODE_WB_Katakana)
369 return result(i, 0); /* WB13 */
370
371 switch (prevclass) {
372 case UNICODE_WB_ALetter:
373 case UNICODE_WB_Numeric:
374 case UNICODE_WB_Katakana:
375 case UNICODE_WB_ExtendNumLet:
376 if (cl == UNICODE_WB_ExtendNumLet)
377 return result(i, 0); /* WB13a */
378 }
379
380 if (prevclass == UNICODE_WB_ExtendNumLet)
381 switch (cl) {
382 case UNICODE_WB_ALetter:
383 case UNICODE_WB_Numeric:
384 case UNICODE_WB_Katakana:
385 return result(i, 0); /* WB13b */
386 }
387
388 return result(i, 1); /* WB14 */
389 }
390
391 /* --------------------------------------------------------------------- */
392
393 struct unicode_wbscan_info {
394 unicode_wb_info_t wb_handle;
395
396 int found;
397 size_t cnt;
398 };
399
400 static int unicode_wbscan_callback(int, void *);
401
402 unicode_wbscan_info_t unicode_wbscan_init()
403 {
404 unicode_wbscan_info_t i=calloc(1, sizeof(struct unicode_wbscan_info));
405
406 if (!i)
407 return NULL;
408
409 if ((i->wb_handle=unicode_wb_init(unicode_wbscan_callback, i)) == NULL)
410 {
411 free(i);
412 return NULL;
413 }
414
415 return i;
416 }
417
418 int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch)
419 {
420 if (!i->found)
421 unicode_wb_next(i->wb_handle, ch);
422
423 return i->found;
424 }
425
426 size_t unicode_wbscan_end(unicode_wbscan_info_t i)
427 {
428 size_t n;
429
430 unicode_wb_end(i->wb_handle);
431
432 n=i->cnt;
433 free(i);
434 return n;
435 }
436
437 static int unicode_wbscan_callback(int flag, void *arg)
438 {
439 unicode_wbscan_info_t i=(unicode_wbscan_info_t)arg;
440
441 if (flag && i->cnt > 0)
442 i->found=1;
443
444 if (!i->found)
445 ++i->cnt;
446 return 0;
447 }
448