Imported Upstream version 0.66.1
[hcoop/debian/courier-authlib.git] / libs / unicode / unicode_linebreak.c
1 /*
2 ** Copyright 2011 Double Precision, Inc.
3 ** See COPYING for distribution information.
4 **
5 */
6
7 #include "unicode_config.h"
8 #include "unicode.h"
9
10 #include <unistd.h>
11 #include <stdint.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <errno.h>
15
16 #include "linebreaktab_internal.h"
17
18 #include "linebreaktab.h"
19
/*
** Pseudo-class held in prevclass/prevclass_nsp until the first character
** has been processed ("start of text").
*/
#define UNICODE_LB_SOT 0xFF

/*
** State carried from one character to the next by the line breaking
** state machine.
*/
struct unicode_lb_info {
	/* Receives each linebreak verdict, together with its opaque argument */
	int (*cb_func)(int value, void *arg);
	void *cb_arg;

	/* Option flags installed by unicode_lb_set_opts() */
	int opts;

	/*
	** LB25 lookahead: class of the withheld OP/HY character, and the
	** number of CM characters counted after it.
	*/
	uint8_t savedclass;
	size_t savedcmcnt;

	/*
	** Class recorded for the previous character, and the most recent
	** recorded class that was not SP.
	*/
	uint8_t prevclass;
	uint8_t prevclass_nsp;

	/* Current handlers for the next character and for end of input */
	int (*next_handler)(struct unicode_lb_info *info, uint8_t uclass);
	int (*end_handler)(struct unicode_lb_info *info);
};
37
38
39 /* http://www.unicode.org/reports/tr14/#Algorithm */
40
41 static int next_def(unicode_lb_info_t, uint8_t);
42 static int end_def(unicode_lb_info_t);
43
44 static int next_lb25_seenophy(unicode_lb_info_t, uint8_t);
45 static int end_lb25_seenophy(unicode_lb_info_t);
46
47 static int next_lb25_seennu(unicode_lb_info_t, uint8_t);
48
49 static int next_lb25_seennuclcp(unicode_lb_info_t, uint8_t);
50
51 static void unicode_lb_reset(unicode_lb_info_t i)
52 {
53 i->prevclass=i->prevclass_nsp=UNICODE_LB_SOT;
54 i->next_handler=next_def;
55 i->end_handler=end_def;
56 }
57
58 unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *),
59 void *cb_arg)
60 {
61 unicode_lb_info_t i=calloc(1, sizeof(struct unicode_lb_info));
62
63 i->cb_func=cb_func;
64 i->cb_arg=cb_arg;
65
66 unicode_lb_reset(i);
67 return i;
68 }
69
70 int unicode_lb_end(unicode_lb_info_t i)
71 {
72 int rc=(*i->end_handler)(i);
73
74 free(i);
75 return rc;
76 }
77
78 void unicode_lb_set_opts(unicode_lb_info_t i, int opts)
79 {
80 i->opts=opts;
81 }
82
83 /* Default end handler has nothing to do */
84
85 static int end_def(unicode_lb_info_t i)
86 {
87 /* LB3 N/A */
88 return 0;
89 }
90 #define RESULT(x) (*i->cb_func)((x), i->cb_arg)
91
92 int unicode_lb_next_cnt(unicode_lb_info_t i,
93 const unicode_char *chars,
94 size_t cnt)
95 {
96 while (cnt)
97 {
98 int rc=unicode_lb_next(i, *chars);
99
100 if (rc)
101 return rc;
102
103 ++chars;
104 --cnt;
105 }
106 return 0;
107 }
108
109 int unicode_lb_lookup(unicode_char ch)
110 {
111 return unicode_tab_lookup(ch,
112 unicode_indextab,
113 sizeof(unicode_indextab)
114 / sizeof(unicode_indextab[0]),
115 unicode_rangetab,
116 unicode_classtab,
117 UNICODE_LB_AL /* XX, LB1 */);
118 }
119
120 int unicode_lb_next(unicode_lb_info_t i,
121 unicode_char ch)
122 {
123 return (*i->next_handler)(i, (i->opts & UNICODE_LB_OPT_DASHWJ) &&
124 (ch == 0x2012 || ch == 0x2013)
125 ? UNICODE_LB_WJ:unicode_lb_lookup(ch));
126 }
127
128 static int next_def_nolb25(unicode_lb_info_t i,
129 uint8_t uclass,
130 int nolb25);
131
132 /*
133 ** Default logic for next unicode char.
134 */
135 static int next_def(unicode_lb_info_t i,
136 uint8_t uclass)
137 {
138 return next_def_nolb25(i, uclass, 0);
139 }
140
141 static int next_def_nolb25(unicode_lb_info_t i,
142 uint8_t uclass,
143
144 /* Flag -- recursively invoked after discarding LB25 */
145 int nolb25)
146 {
147
148 /* Retrieve the previous unicode character's linebreak class. */
149
150 uint8_t prevclass=i->prevclass;
151 uint8_t prevclass_nsp=i->prevclass_nsp;
152
153 /* Save this unicode char's linebreak class, for the next goaround */
154 i->prevclass=uclass;
155
156 if (uclass != UNICODE_LB_SP)
157 i->prevclass_nsp=uclass;
158
159 if (uclass == UNICODE_LB_NU)
160 i->next_handler=next_lb25_seennu; /* LB25 */
161
162 if (prevclass == UNICODE_LB_SOT)
163 {
164 if (uclass == UNICODE_LB_CM) /* LB9 */
165 i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
166
167 return RESULT(UNICODE_LB_NONE); /* LB2 */
168 }
169
170 if (prevclass == UNICODE_LB_CR && uclass == UNICODE_LB_LF)
171 return RESULT(UNICODE_LB_NONE); /* LB5 */
172
173 switch (prevclass) {
174 case UNICODE_LB_BK:
175 case UNICODE_LB_CR:
176 case UNICODE_LB_LF:
177 case UNICODE_LB_NL:
178
179 if (uclass == UNICODE_LB_CM)
180 {
181 i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
182 /* LB9 */
183 }
184
185 return RESULT(UNICODE_LB_MANDATORY); /* LB4, LB5 */
186
187 case UNICODE_LB_SP:
188 case UNICODE_LB_ZW:
189 if (uclass == UNICODE_LB_CM)
190 i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
191 /* LB10 */
192 break;
193 default:
194 break;
195 }
196
197 switch (uclass) {
198
199 /* LB6: */
200 case UNICODE_LB_BK:
201 case UNICODE_LB_CR:
202 case UNICODE_LB_LF:
203 case UNICODE_LB_NL:
204
205 /* LB7: */
206 case UNICODE_LB_SP:
207 case UNICODE_LB_ZW:
208
209 return RESULT(UNICODE_LB_NONE);
210 default:
211 break;
212 }
213
214 if (prevclass_nsp == UNICODE_LB_ZW)
215 return RESULT(UNICODE_LB_ALLOWED); /* LB8 */
216
217 if (uclass == UNICODE_LB_CM)
218 {
219 i->prevclass=prevclass;
220 i->prevclass_nsp=prevclass_nsp;
221 return RESULT(UNICODE_LB_NONE); /* LB9 */
222 }
223
224 if (prevclass == UNICODE_LB_WJ || uclass == UNICODE_LB_WJ)
225 return RESULT(UNICODE_LB_NONE); /* LB11 */
226
227 if (prevclass == UNICODE_LB_GL)
228 return RESULT(UNICODE_LB_NONE); /* LB12 */
229
230 if (uclass == UNICODE_LB_GL &&
231 prevclass != UNICODE_LB_SP &&
232 prevclass != UNICODE_LB_BA &&
233 prevclass != UNICODE_LB_HY)
234 return RESULT(UNICODE_LB_NONE); /* LB12a */
235
236
237 switch (uclass) {
238 case UNICODE_LB_SY:
239 if (i->opts & UNICODE_LB_OPT_SYBREAK)
240 {
241 if (prevclass == UNICODE_LB_SP)
242 return RESULT(UNICODE_LB_ALLOWED);
243 }
244
245 case UNICODE_LB_CL:
246 case UNICODE_LB_CP:
247 case UNICODE_LB_EX:
248 case UNICODE_LB_IS:
249 return RESULT(UNICODE_LB_NONE); /* LB13 */
250 default:
251 break;
252 }
253
254 if ((i->opts & UNICODE_LB_OPT_SYBREAK) && prevclass == UNICODE_LB_SY)
255 switch (uclass) {
256 case UNICODE_LB_EX:
257 case UNICODE_LB_AL:
258 case UNICODE_LB_ID:
259 return RESULT(UNICODE_LB_NONE);
260 }
261
262 if (prevclass_nsp == UNICODE_LB_OP)
263 return RESULT(UNICODE_LB_NONE); /* LB14 */
264
265 if (prevclass_nsp == UNICODE_LB_QU && uclass == UNICODE_LB_OP)
266 return RESULT(UNICODE_LB_NONE); /* LB15 */
267
268 if ((prevclass_nsp == UNICODE_LB_CL || prevclass_nsp == UNICODE_LB_CP)
269 && uclass == UNICODE_LB_NS)
270 return RESULT(UNICODE_LB_NONE); /* LB16 */
271
272 if (prevclass_nsp == UNICODE_LB_B2 && uclass == UNICODE_LB_B2)
273 return RESULT(UNICODE_LB_NONE); /* LB17 */
274
275 if (prevclass == UNICODE_LB_SP)
276 return RESULT(UNICODE_LB_ALLOWED); /* LB18 */
277
278 if (uclass == UNICODE_LB_QU || prevclass == UNICODE_LB_QU)
279 return RESULT(UNICODE_LB_NONE); /* LB19 */
280
281 if (uclass == UNICODE_LB_CB || prevclass == UNICODE_LB_CB)
282 return RESULT(UNICODE_LB_ALLOWED); /* LB20 */
283
284 /* LB21: */
285
286 switch (uclass) {
287 case UNICODE_LB_BA:
288 case UNICODE_LB_HY:
289 case UNICODE_LB_NS:
290 return RESULT(UNICODE_LB_NONE);
291 default:
292 break;
293 }
294
295 if (prevclass == UNICODE_LB_BB)
296 return RESULT(UNICODE_LB_NONE);
297
298 if (uclass == UNICODE_LB_IN)
299 switch (prevclass) {
300 case UNICODE_LB_AL:
301 case UNICODE_LB_ID:
302 case UNICODE_LB_IN:
303 case UNICODE_LB_NU:
304 return RESULT(UNICODE_LB_NONE); /* LB22 */
305 default:
306 break;
307 }
308
309
310 if (prevclass == UNICODE_LB_ID && uclass == UNICODE_LB_PO)
311 return RESULT(UNICODE_LB_NONE); /* LB23 */
312 if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_NU)
313 return RESULT(UNICODE_LB_NONE); /* LB23 */
314
315 if (prevclass == UNICODE_LB_NU && uclass == UNICODE_LB_AL)
316 return RESULT(UNICODE_LB_NONE); /* LB23 */
317
318
319 if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_ID)
320 return RESULT(UNICODE_LB_NONE); /* LB24 */
321 if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_AL)
322 return RESULT(UNICODE_LB_NONE); /* LB24 */
323 if (prevclass == UNICODE_LB_PO && uclass == UNICODE_LB_AL)
324 return RESULT(UNICODE_LB_NONE); /* LB24 */
325
326 if ((i->opts & UNICODE_LB_OPT_PRBREAK) && uclass == UNICODE_LB_PR)
327 switch (prevclass) {
328 case UNICODE_LB_PR:
329 case UNICODE_LB_AL:
330 case UNICODE_LB_ID:
331 return RESULT(UNICODE_LB_NONE);
332 }
333
334 if (!nolb25 &&
335 (prevclass == UNICODE_LB_PR || prevclass == UNICODE_LB_PO))
336 {
337 if (uclass == UNICODE_LB_NU)
338 return RESULT(UNICODE_LB_NONE); /* LB25 */
339
340 if (uclass == UNICODE_LB_OP || uclass == UNICODE_LB_HY)
341 {
342 i->prevclass=prevclass;
343 i->prevclass_nsp=prevclass_nsp;
344
345 i->savedclass=uclass;
346 i->savedcmcnt=0;
347 i->next_handler=next_lb25_seenophy;
348 i->end_handler=end_lb25_seenophy;
349 return 0;
350 }
351 }
352
353 if ((prevclass == UNICODE_LB_OP || prevclass == UNICODE_LB_HY) &&
354 uclass == UNICODE_LB_NU)
355 return RESULT(UNICODE_LB_NONE); /* LB25 */
356
357 /*****/
358
359 if (prevclass == UNICODE_LB_JL)
360 switch (uclass) {
361 case UNICODE_LB_JL:
362 case UNICODE_LB_JV:
363 case UNICODE_LB_H2:
364 case UNICODE_LB_H3:
365 return RESULT(UNICODE_LB_NONE); /* LB26 */
366 default:
367 break;
368 }
369
370 if ((prevclass == UNICODE_LB_JV ||
371 prevclass == UNICODE_LB_H2) &&
372 (uclass == UNICODE_LB_JV ||
373 uclass == UNICODE_LB_JT))
374 return RESULT(UNICODE_LB_NONE); /* LB26 */
375
376 if ((prevclass == UNICODE_LB_JT ||
377 prevclass == UNICODE_LB_H3) &&
378 uclass == UNICODE_LB_JT)
379 return RESULT(UNICODE_LB_NONE); /* LB26 */
380
381
382 switch (prevclass) {
383 case UNICODE_LB_JL:
384 case UNICODE_LB_JV:
385 case UNICODE_LB_JT:
386 case UNICODE_LB_H2:
387 case UNICODE_LB_H3:
388 if (uclass == UNICODE_LB_IN || uclass == UNICODE_LB_PO)
389 return RESULT(UNICODE_LB_NONE); /* LB27 */
390 default:
391 break;
392 }
393
394 switch (uclass) {
395 case UNICODE_LB_JL:
396 case UNICODE_LB_JV:
397 case UNICODE_LB_JT:
398 case UNICODE_LB_H2:
399 case UNICODE_LB_H3:
400 if (prevclass == UNICODE_LB_PR)
401 return RESULT(UNICODE_LB_NONE); /* LB27 */
402 default:
403 break;
404 }
405
406 if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_AL)
407 return RESULT(UNICODE_LB_NONE); /* LB28 */
408
409 if (prevclass == UNICODE_LB_IS && uclass == UNICODE_LB_AL)
410 return RESULT(UNICODE_LB_NONE); /* LB29 */
411
412 if ((prevclass == UNICODE_LB_AL || prevclass == UNICODE_LB_NU) &&
413 uclass == UNICODE_LB_OP)
414 return RESULT(UNICODE_LB_NONE); /* LB30 */
415
416 if ((uclass == UNICODE_LB_AL || uclass == UNICODE_LB_NU) &&
417 prevclass == UNICODE_LB_CP)
418 return RESULT(UNICODE_LB_NONE); /* LB30 */
419
420 return RESULT(UNICODE_LB_ALLOWED); /* LB31 */
421 }
422
423 /*
424 ** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second
425 ** character, but NU did not follow. Backtrack.
426 */
427
428 static int unwind_lb25_seenophy(unicode_lb_info_t i)
429 {
430 int rc;
431
432 /*uint8_t class=i->savedclass;*/
433 int nolb25_flag=1;
434
435 i->next_handler=next_def;
436 i->end_handler=end_def;
437
438 do
439 {
440 rc=next_def_nolb25(i, i->savedclass, nolb25_flag);
441
442 if (rc)
443 return rc;
444
445 /*class=UNICODE_LB_CM;*/
446 nolb25_flag=0;
447 } while (i->savedcmcnt--);
448 return 0;
449 }
450
451 /*
452 ** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second
453 ** character. If there's now a NU, we found the modified LB25 regexp.
454 */
455
456 static int next_lb25_seenophy(unicode_lb_info_t i,
457 uint8_t uclass)
458 {
459 int rc;
460
461 if (uclass == UNICODE_LB_CM)
462 {
463 ++i->savedcmcnt; /* Keep track of CMs, and try again */
464 return 0;
465 }
466
467 if (uclass != UNICODE_LB_NU)
468 {
469 rc=unwind_lb25_seenophy(i);
470
471 if (rc)
472 return rc;
473
474 return next_def_nolb25(i, uclass, 0);
475 }
476
477 do
478 {
479 rc=RESULT(UNICODE_LB_NONE); /* (OP|HY) feedback */
480
481 if (rc)
482 return rc;
483 } while (i->savedcmcnt--);
484
485 i->next_handler=next_lb25_seennu;
486 i->end_handler=end_def;
487 i->prevclass=i->prevclass_nsp=uclass;
488 return RESULT(UNICODE_LB_NONE);
489 }
490
491 /*
492 ** Seen (PR|PO)(OP|HY), and now The End. Unwind, and give up.
493 */
494
495 static int end_lb25_seenophy(unicode_lb_info_t i)
496 {
497 int rc=unwind_lb25_seenophy(i);
498
499 if (rc == 0)
500 rc=end_def(i);
501 return rc;
502 }
503
504 /*
505 ** Seen an NU, modified LB25 regexp.
506 */
507 static int next_lb25_seennu(unicode_lb_info_t i, uint8_t uclass)
508 {
509 if (uclass == UNICODE_LB_NU || uclass == UNICODE_LB_SY ||
510 uclass == UNICODE_LB_IS)
511 {
512 i->prevclass=i->prevclass_nsp=uclass;
513 return RESULT(UNICODE_LB_NONE);
514 }
515
516 if (uclass == UNICODE_LB_CM)
517 return RESULT(UNICODE_LB_NONE); /* LB9 */
518
519 if (uclass == UNICODE_LB_CL || uclass == UNICODE_LB_CP)
520 {
521 i->prevclass=i->prevclass_nsp=uclass;
522 i->next_handler=next_lb25_seennuclcp;
523 i->end_handler=end_def;
524 return RESULT(UNICODE_LB_NONE);
525 }
526
527 i->next_handler=next_def;
528 i->end_handler=end_def;
529
530 if (uclass == UNICODE_LB_PR || uclass == UNICODE_LB_PO)
531 {
532 i->prevclass=i->prevclass_nsp=uclass;
533 return RESULT(UNICODE_LB_NONE);
534 }
535
536 return next_def(i, uclass); /* Not a prefix, process normally */
537 }
538
539 /*
540 ** Seen CL|CP, in the modified LB25 regexp.
541 */
542 static int next_lb25_seennuclcp(unicode_lb_info_t i, uint8_t uclass)
543 {
544 if (uclass == UNICODE_LB_CM)
545 return RESULT(UNICODE_LB_NONE); /* LB9 */
546
547 i->next_handler=next_def;
548 i->end_handler=end_def;
549
550 if (uclass == UNICODE_LB_PR || uclass == UNICODE_LB_PO)
551 {
552 i->prevclass=i->prevclass_nsp=uclass;
553
554 return RESULT(UNICODE_LB_NONE);
555 }
556
557 return next_def(i, uclass);
558 }
559
560 /******************/
561
562 struct unicode_lbc_info {
563 unicode_lb_info_t handle;
564
565 struct unicode_buf buf;
566
567 size_t buf_ptr;
568
569 int (*cb_func)(int, unicode_char, void *);
570 void *cb_arg;
571 };
572
573 static int unicode_lbc_callback(int value, void *ptr)
574 {
575 unicode_lbc_info_t h=(unicode_lbc_info_t)ptr;
576
577 if (h->buf_ptr >= unicode_buf_len(&h->buf))
578 {
579 errno=EINVAL;
580 return -1; /* Shouldn't happen */
581 }
582
583 return (*h->cb_func)(value, unicode_buf_ptr(&h->buf)[h->buf_ptr++],
584 h->cb_arg);
585 }
586
587 unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char, void *),
588 void *cb_arg)
589 {
590 unicode_lbc_info_t h=
591 (unicode_lbc_info_t)calloc(1, sizeof(struct unicode_lbc_info));
592
593 if (!h)
594 return NULL;
595
596 h->cb_func=cb_func;
597 h->cb_arg=cb_arg;
598
599 if ((h->handle=unicode_lb_init(unicode_lbc_callback, h)) == NULL)
600 {
601 free(h);
602 return NULL;
603 }
604 unicode_buf_init(&h->buf, (size_t)-1);
605 return h;
606 }
607
608 void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts)
609 {
610 unicode_lb_set_opts(i->handle, opts);
611 }
612
613 int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch)
614 {
615 if (i->buf_ptr >= unicode_buf_len(&i->buf))
616 {
617 i->buf_ptr=0;
618 unicode_buf_clear(&i->buf);
619 }
620
621 unicode_buf_append(&i->buf, &ch, 1);
622 return unicode_lb_next(i->handle, ch);
623 }
624
625 int unicode_lbc_end(unicode_lbc_info_t i)
626 {
627 int rc=unicode_lb_end(i->handle);
628
629 unicode_buf_deinit(&i->buf);
630 free(i);
631 return rc;
632 }