Imported Debian patch 0.66.1-1
[hcoop/debian/courier-authlib.git] / libs / unicode / unicode_wordbreak.c
CommitLineData
b0322a85
CE
1/*
2** Copyright 2011 Double Precision, Inc.
3** See COPYING for distribution information.
4**
5*/
6
7#include "unicode_config.h"
8#include "unicode.h"
9
10#include <unistd.h>
11#include <stdint.h>
12#include <stdlib.h>
13#include <string.h>
14#include <errno.h>
15
16#include "wordbreaktab_internal.h"
17#include "wordbreaktab.h"
18
19struct unicode_wb_info {
20 int (*cb_func)(int, void *);
21 void *cb_arg;
22
23 uint8_t prevclass;
24 size_t wb4_cnt;
25
26 size_t wb4_extra_cnt;
27
28 int (*next_handler)(unicode_wb_info_t, uint8_t);
29 int (*end_handler)(unicode_wb_info_t);
30};
31
32static int sot(unicode_wb_info_t i, uint8_t cl);
33static int wb4(unicode_wb_info_t i);
34static int wb1and2_done(unicode_wb_info_t i, uint8_t cl);
35
36static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl);
37static int seen_wb67_end_handler(unicode_wb_info_t i);
38static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);
39
40static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl);
41static int seen_wb1112_end_handler(unicode_wb_info_t i);
42static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl);
43
44unicode_wb_info_t unicode_wb_init(int (*cb_func)(int, void *),
45 void *cb_arg)
46{
47 unicode_wb_info_t i=calloc(1, sizeof(struct unicode_wb_info));
48
49 if (!i)
50 return NULL;
51
52 i->next_handler=sot;
53 i->cb_func=cb_func;
54 i->cb_arg=cb_arg;
55 return i;
56}
57
58int unicode_wb_end(unicode_wb_info_t i)
59{
60 int rc;
61
62 if (i->end_handler)
63 rc=(*i->end_handler)(i);
64 else
65 rc=wb4(i);
66
67 free(i);
68 return rc;
69}
70
71int unicode_wb_next_cnt(unicode_wb_info_t i,
72 const unicode_char *chars,
73 size_t cnt)
74{
75 int rc;
76
77 while (cnt)
78 {
79 rc=unicode_wb_next(i, *chars++);
80 --cnt;
81 if (rc)
82 return rc;
83 }
84 return 0;
85}
86
87int unicode_wb_next(unicode_wb_info_t i, unicode_char ch)
88{
89 return (*i->next_handler)
90 (i, unicode_tab_lookup(ch,
91 unicode_indextab,
92 sizeof(unicode_indextab)
93 / sizeof(unicode_indextab[0]),
94 unicode_rangetab,
95 unicode_classtab,
96 UNICODE_WB_OTHER));
97}
98
99static int wb4(unicode_wb_info_t i)
100{
101 int rc=0;
102
103 while (i->wb4_cnt > 0)
104 {
105 --i->wb4_cnt;
106
107 if (rc == 0)
108 rc=(*i->cb_func)(0, i->cb_arg);
109 }
110 return rc;
111}
112
113static int result(unicode_wb_info_t i, int flag)
114{
115 int rc=wb4(i);
116
117 if (rc == 0)
118 rc=(*i->cb_func)(flag, i->cb_arg);
119
120 return rc;
121}
122
/*
** Install the per-character and end-of-text handlers of the next state.
** Relies on a variable named i, the scanner handle, being in scope.
*/
#define SET_HANDLER(next,end) \
	((i)->next_handler=(next), (i)->end_handler=(end))
124
125static int sot(unicode_wb_info_t i, uint8_t cl)
126{
127 i->prevclass=cl;
128 SET_HANDLER(wb1and2_done, NULL);
129
130 return result(i, 1); /* WB1 */
131}
132
133static int wb1and2_done(unicode_wb_info_t i, uint8_t cl)
134{
135 uint8_t prevclass=i->prevclass;
136
137 i->prevclass=cl;
138
139 if (prevclass == UNICODE_WB_CR && cl == UNICODE_WB_LF)
140 return result(i, 0); /* WB3 */
141
142 switch (prevclass) {
143 case UNICODE_WB_CR:
144 case UNICODE_WB_LF:
145 case UNICODE_WB_Newline:
146 return result(i, 1); /* WB3a */
147 }
148
149 switch (cl) {
150 case UNICODE_WB_CR:
151 case UNICODE_WB_LF:
152 case UNICODE_WB_Newline:
153 return result(i, 1); /* WB3b */
154 }
155
156 if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
157 {
158 i->prevclass=prevclass;
159 ++i->wb4_cnt;
160 return 0; /* WB4 */
161 }
162
163 if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_ALetter)
164 {
165 return result(i, 0); /* WB5 */
166 }
167
168 if (prevclass == UNICODE_WB_ALetter &&
169 (cl == UNICODE_WB_MidLetter || cl == UNICODE_WB_MidNumLet))
170 {
171 i->wb4_extra_cnt=0;
172 SET_HANDLER(seen_wb67_handler, seen_wb67_end_handler);
173 return 0;
174 }
175
176 return wb67_done(i, prevclass, cl);
177}
178
179/*
180** ALetter (MidLetter | MidNumLet ) ?
181**
182** prevclass cl
183**
184** Seen ALetter (MidLetter | MidNumLet), with the second character's status
185** not returned yet.
186*/
187
188static int seen_wb67_handler(unicode_wb_info_t i, uint8_t cl)
189{
190 int rc;
191 uint8_t prevclass;
192 size_t extra_cnt;
193
194 if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
195 {
196 ++i->wb4_extra_cnt;
197 return 0;
198 }
199
200 extra_cnt=i->wb4_extra_cnt;
201
202 /*
203 ** Reset the handler to the default, then check WB6
204 */
205
206 SET_HANDLER(wb1and2_done, NULL);
207
208 if (cl == UNICODE_WB_ALetter)
209 {
210 rc=result(i, 0); /* WB6 */
211 i->wb4_cnt=extra_cnt;
212
213 if (rc == 0)
214 rc=result(i, 0); /* WB7 */
215
216 i->prevclass=cl;
217
218 return rc;
219 }
220
221 prevclass=i->prevclass; /* This was the second character */
222
223 /*
224 ** Process the second character, starting with WB7
225 */
226
227 rc=wb67_done(i, UNICODE_WB_ALetter, prevclass);
228
229 i->prevclass=prevclass;
230 i->wb4_cnt=extra_cnt;
231
232 if (rc == 0)
233 rc=(*i->next_handler)(i, cl);
234 /* Process the current char now */
235
236 return rc;
237}
238
239/*
240** Seen ALetter (MidLetter | MidNumLet), with the second character's status
241** not returned yet, and now sot.
242*/
243
244static int seen_wb67_end_handler(unicode_wb_info_t i)
245{
246 int rc;
247 size_t extra_cnt=i->wb4_extra_cnt;
248
249 /*
250 ** Process the second character, starting with WB7.
251 */
252
253 rc=wb67_done(i, UNICODE_WB_ALetter, i->prevclass);
254 i->wb4_cnt=extra_cnt;
255 if (rc == 0)
256 rc=wb4(i);
257 return rc;
258}
259
260
261static int wb67_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
262{
263 if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_Numeric)
264 return result(i, 0); /* WB8 */
265
266 if (prevclass == UNICODE_WB_ALetter && cl == UNICODE_WB_Numeric)
267 return result(i, 0); /* WB9 */
268
269 if (prevclass == UNICODE_WB_Numeric && cl == UNICODE_WB_ALetter)
270 return result(i, 0); /* WB10 */
271
272
273 if (prevclass == UNICODE_WB_Numeric &&
274 (cl == UNICODE_WB_MidNum || cl == UNICODE_WB_MidNumLet))
275 {
276 i->wb4_extra_cnt=0;
277 SET_HANDLER(seen_wb1112_handler, seen_wb1112_end_handler);
278 return 0;
279 }
280
281 return wb1112_done(i, prevclass, cl);
282}
283
284/*
285** Numeric (MidNum | MidNumLet ) ?
286**
287** prevclass cl
288**
289** Seen Numeric (MidNum | MidNumLet), with the second character's status
290** not returned yet.
291*/
292
293static int seen_wb1112_handler(unicode_wb_info_t i, uint8_t cl)
294{
295 int rc;
296 uint8_t prevclass;
297 size_t extra_cnt;
298
299 if (cl == UNICODE_WB_Extend || cl == UNICODE_WB_Format)
300 {
301 ++i->wb4_extra_cnt;
302 return 0;
303 }
304
305 extra_cnt=i->wb4_extra_cnt;
306
307 /*
308 ** Reset the handler to the default, then check WB6
309 */
310
311 SET_HANDLER(wb1and2_done, NULL);
312
313 if (cl == UNICODE_WB_Numeric)
314 {
315 rc=result(i, 0); /* WB11 */
316 i->wb4_cnt=extra_cnt;
317
318 if (rc == 0)
319 rc=result(i, 0); /* WB12 */
320
321 i->prevclass=cl;
322
323 return rc;
324 }
325
326 prevclass=i->prevclass; /* This was the second character */
327
328 /*
329 ** Process the second character, starting with WB7
330 */
331
332 rc=wb1112_done(i, UNICODE_WB_Numeric, prevclass);
333
334 i->prevclass=prevclass;
335 i->wb4_cnt=extra_cnt;
336
337 if (rc == 0)
338 rc=(*i->next_handler)(i, cl);
339 /* Process the current char now */
340
341 return rc;
342}
343
344/*
345** Seen Numeric (MidNum | MidNumLet), with the second character's status
346** not returned yet, and now sot.
347*/
348
349static int seen_wb1112_end_handler(unicode_wb_info_t i)
350{
351 int rc;
352 size_t extra_cnt=i->wb4_extra_cnt;
353
354 /*
355 ** Process the second character, starting with WB11.
356 */
357
358 rc=wb1112_done(i, UNICODE_WB_Numeric, i->prevclass);
359 i->wb4_cnt=extra_cnt;
360 if (rc == 0)
361 rc=wb4(i);
362 return rc;
363}
364
365static int wb1112_done(unicode_wb_info_t i, uint8_t prevclass, uint8_t cl)
366{
367 if (prevclass == UNICODE_WB_Katakana &&
368 cl == UNICODE_WB_Katakana)
369 return result(i, 0); /* WB13 */
370
371 switch (prevclass) {
372 case UNICODE_WB_ALetter:
373 case UNICODE_WB_Numeric:
374 case UNICODE_WB_Katakana:
375 case UNICODE_WB_ExtendNumLet:
376 if (cl == UNICODE_WB_ExtendNumLet)
377 return result(i, 0); /* WB13a */
378 }
379
380 if (prevclass == UNICODE_WB_ExtendNumLet)
381 switch (cl) {
382 case UNICODE_WB_ALetter:
383 case UNICODE_WB_Numeric:
384 case UNICODE_WB_Katakana:
385 return result(i, 0); /* WB13b */
386 }
387
388 return result(i, 1); /* WB14 */
389}
390
391/* --------------------------------------------------------------------- */
392
393struct unicode_wbscan_info {
394 unicode_wb_info_t wb_handle;
395
396 int found;
397 size_t cnt;
398};
399
/* Flag receiver installed by unicode_wbscan_init(), defined below */
static int unicode_wbscan_callback(int flag, void *arg);
401
402unicode_wbscan_info_t unicode_wbscan_init()
403{
404 unicode_wbscan_info_t i=calloc(1, sizeof(struct unicode_wbscan_info));
405
406 if (!i)
407 return NULL;
408
409 if ((i->wb_handle=unicode_wb_init(unicode_wbscan_callback, i)) == NULL)
410 {
411 free(i);
412 return NULL;
413 }
414
415 return i;
416}
417
418int unicode_wbscan_next(unicode_wbscan_info_t i, unicode_char ch)
419{
420 if (!i->found)
421 unicode_wb_next(i->wb_handle, ch);
422
423 return i->found;
424}
425
426size_t unicode_wbscan_end(unicode_wbscan_info_t i)
427{
428 size_t n;
429
430 unicode_wb_end(i->wb_handle);
431
432 n=i->cnt;
433 free(i);
434 return n;
435}
436
437static int unicode_wbscan_callback(int flag, void *arg)
438{
439 unicode_wbscan_info_t i=(unicode_wbscan_info_t)arg;
440
441 if (flag && i->cnt > 0)
442 i->found=1;
443
444 if (!i->found)
445 ++i->cnt;
446 return 0;
447}
448