Merge branch 'debian'
[hcoop/debian/courier-authlib.git] / libs / unicode / unicode_linebreak.c
CommitLineData
b0322a85
CE
1/*
2** Copyright 2011 Double Precision, Inc.
3** See COPYING for distribution information.
4**
5*/
6
7#include "unicode_config.h"
8#include "unicode.h"
9
10#include <unistd.h>
11#include <stdint.h>
12#include <stdlib.h>
13#include <string.h>
14#include <errno.h>
15
16#include "linebreaktab_internal.h"
17
18#include "linebreaktab.h"
19
/*
** Pseudo linebreak class: stored in prevclass/prevclass_nsp by
** unicode_lb_reset() to mark that no character has been processed yet
** ("start of text").
*/
#define UNICODE_LB_SOT 0xFF

/*
** State of one line breaking pass.
*/
struct unicode_lb_info {
	/* Callback that receives each linebreak value, and its opaque arg */
	int (*cb_func)(int value, void *arg);
	void *cb_arg;

	/* Option bits, set by unicode_lb_set_opts() */
	int opts;

	/*
	** LB25 look-ahead: class of the withheld OP/HY character, and the
	** number of CM characters counted after it.
	*/
	uint8_t savedclass;
	size_t savedcmcnt;

	/*
	** Class of the previous character, and class of the most recent
	** character that was not SP.
	*/
	uint8_t prevclass;
	uint8_t prevclass_nsp;

	/* Current state: per-character handler, and end-of-text handler */
	int (*next_handler)(struct unicode_lb_info *info, uint8_t uclass);
	int (*end_handler)(struct unicode_lb_info *info);
};
37
38
39/* http://www.unicode.org/reports/tr14/#Algorithm */
40
41static int next_def(unicode_lb_info_t, uint8_t);
42static int end_def(unicode_lb_info_t);
43
44static int next_lb25_seenophy(unicode_lb_info_t, uint8_t);
45static int end_lb25_seenophy(unicode_lb_info_t);
46
47static int next_lb25_seennu(unicode_lb_info_t, uint8_t);
48
49static int next_lb25_seennuclcp(unicode_lb_info_t, uint8_t);
50
51static void unicode_lb_reset(unicode_lb_info_t i)
52{
53 i->prevclass=i->prevclass_nsp=UNICODE_LB_SOT;
54 i->next_handler=next_def;
55 i->end_handler=end_def;
56}
57
58unicode_lb_info_t unicode_lb_init(int (*cb_func)(int, void *),
59 void *cb_arg)
60{
61 unicode_lb_info_t i=calloc(1, sizeof(struct unicode_lb_info));
62
63 i->cb_func=cb_func;
64 i->cb_arg=cb_arg;
65
66 unicode_lb_reset(i);
67 return i;
68}
69
70int unicode_lb_end(unicode_lb_info_t i)
71{
72 int rc=(*i->end_handler)(i);
73
74 free(i);
75 return rc;
76}
77
78void unicode_lb_set_opts(unicode_lb_info_t i, int opts)
79{
80 i->opts=opts;
81}
82
83/* Default end handler has nothing to do */
84
85static int end_def(unicode_lb_info_t i)
86{
87 /* LB3 N/A */
88 return 0;
89}
/*
** Report linebreak value (x) through the callback of the handle named `i'
** in the enclosing function; evaluates to the callback's return value.
*/
#define RESULT(x) (i->cb_func((x), i->cb_arg))
91
92int unicode_lb_next_cnt(unicode_lb_info_t i,
93 const unicode_char *chars,
94 size_t cnt)
95{
96 while (cnt)
97 {
98 int rc=unicode_lb_next(i, *chars);
99
100 if (rc)
101 return rc;
102
103 ++chars;
104 --cnt;
105 }
106 return 0;
107}
108
109int unicode_lb_lookup(unicode_char ch)
110{
111 return unicode_tab_lookup(ch,
112 unicode_indextab,
113 sizeof(unicode_indextab)
114 / sizeof(unicode_indextab[0]),
115 unicode_rangetab,
116 unicode_classtab,
117 UNICODE_LB_AL /* XX, LB1 */);
118}
119
120int unicode_lb_next(unicode_lb_info_t i,
121 unicode_char ch)
122{
123 return (*i->next_handler)(i, (i->opts & UNICODE_LB_OPT_DASHWJ) &&
124 (ch == 0x2012 || ch == 0x2013)
125 ? UNICODE_LB_WJ:unicode_lb_lookup(ch));
126}
127
128static int next_def_nolb25(unicode_lb_info_t i,
129 uint8_t uclass,
130 int nolb25);
131
132/*
133** Default logic for next unicode char.
134*/
135static int next_def(unicode_lb_info_t i,
136 uint8_t uclass)
137{
138 return next_def_nolb25(i, uclass, 0);
139}
140
141static int next_def_nolb25(unicode_lb_info_t i,
142 uint8_t uclass,
143
144 /* Flag -- recursively invoked after discarding LB25 */
145 int nolb25)
146{
147
148 /* Retrieve the previous unicode character's linebreak class. */
149
150 uint8_t prevclass=i->prevclass;
151 uint8_t prevclass_nsp=i->prevclass_nsp;
152
153 /* Save this unicode char's linebreak class, for the next goaround */
154 i->prevclass=uclass;
155
156 if (uclass != UNICODE_LB_SP)
157 i->prevclass_nsp=uclass;
158
159 if (uclass == UNICODE_LB_NU)
160 i->next_handler=next_lb25_seennu; /* LB25 */
161
162 if (prevclass == UNICODE_LB_SOT)
163 {
164 if (uclass == UNICODE_LB_CM) /* LB9 */
165 i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
166
167 return RESULT(UNICODE_LB_NONE); /* LB2 */
168 }
169
170 if (prevclass == UNICODE_LB_CR && uclass == UNICODE_LB_LF)
171 return RESULT(UNICODE_LB_NONE); /* LB5 */
172
173 switch (prevclass) {
174 case UNICODE_LB_BK:
175 case UNICODE_LB_CR:
176 case UNICODE_LB_LF:
177 case UNICODE_LB_NL:
178
179 if (uclass == UNICODE_LB_CM)
180 {
181 i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
182 /* LB9 */
183 }
184
185 return RESULT(UNICODE_LB_MANDATORY); /* LB4, LB5 */
186
187 case UNICODE_LB_SP:
188 case UNICODE_LB_ZW:
189 if (uclass == UNICODE_LB_CM)
190 i->prevclass=i->prevclass_nsp=uclass=UNICODE_LB_AL;
191 /* LB10 */
192 break;
193 default:
194 break;
195 }
196
197 switch (uclass) {
198
199 /* LB6: */
200 case UNICODE_LB_BK:
201 case UNICODE_LB_CR:
202 case UNICODE_LB_LF:
203 case UNICODE_LB_NL:
204
205 /* LB7: */
206 case UNICODE_LB_SP:
207 case UNICODE_LB_ZW:
208
209 return RESULT(UNICODE_LB_NONE);
210 default:
211 break;
212 }
213
214 if (prevclass_nsp == UNICODE_LB_ZW)
215 return RESULT(UNICODE_LB_ALLOWED); /* LB8 */
216
217 if (uclass == UNICODE_LB_CM)
218 {
219 i->prevclass=prevclass;
220 i->prevclass_nsp=prevclass_nsp;
221 return RESULT(UNICODE_LB_NONE); /* LB9 */
222 }
223
224 if (prevclass == UNICODE_LB_WJ || uclass == UNICODE_LB_WJ)
225 return RESULT(UNICODE_LB_NONE); /* LB11 */
226
227 if (prevclass == UNICODE_LB_GL)
228 return RESULT(UNICODE_LB_NONE); /* LB12 */
229
230 if (uclass == UNICODE_LB_GL &&
231 prevclass != UNICODE_LB_SP &&
232 prevclass != UNICODE_LB_BA &&
233 prevclass != UNICODE_LB_HY)
234 return RESULT(UNICODE_LB_NONE); /* LB12a */
235
236
237 switch (uclass) {
238 case UNICODE_LB_SY:
239 if (i->opts & UNICODE_LB_OPT_SYBREAK)
240 {
241 if (prevclass == UNICODE_LB_SP)
242 return RESULT(UNICODE_LB_ALLOWED);
243 }
244
245 case UNICODE_LB_CL:
246 case UNICODE_LB_CP:
247 case UNICODE_LB_EX:
248 case UNICODE_LB_IS:
249 return RESULT(UNICODE_LB_NONE); /* LB13 */
250 default:
251 break;
252 }
253
254 if ((i->opts & UNICODE_LB_OPT_SYBREAK) && prevclass == UNICODE_LB_SY)
255 switch (uclass) {
256 case UNICODE_LB_EX:
257 case UNICODE_LB_AL:
258 case UNICODE_LB_ID:
259 return RESULT(UNICODE_LB_NONE);
260 }
261
262 if (prevclass_nsp == UNICODE_LB_OP)
263 return RESULT(UNICODE_LB_NONE); /* LB14 */
264
265 if (prevclass_nsp == UNICODE_LB_QU && uclass == UNICODE_LB_OP)
266 return RESULT(UNICODE_LB_NONE); /* LB15 */
267
268 if ((prevclass_nsp == UNICODE_LB_CL || prevclass_nsp == UNICODE_LB_CP)
269 && uclass == UNICODE_LB_NS)
270 return RESULT(UNICODE_LB_NONE); /* LB16 */
271
272 if (prevclass_nsp == UNICODE_LB_B2 && uclass == UNICODE_LB_B2)
273 return RESULT(UNICODE_LB_NONE); /* LB17 */
274
275 if (prevclass == UNICODE_LB_SP)
276 return RESULT(UNICODE_LB_ALLOWED); /* LB18 */
277
278 if (uclass == UNICODE_LB_QU || prevclass == UNICODE_LB_QU)
279 return RESULT(UNICODE_LB_NONE); /* LB19 */
280
281 if (uclass == UNICODE_LB_CB || prevclass == UNICODE_LB_CB)
282 return RESULT(UNICODE_LB_ALLOWED); /* LB20 */
283
284 /* LB21: */
285
286 switch (uclass) {
287 case UNICODE_LB_BA:
288 case UNICODE_LB_HY:
289 case UNICODE_LB_NS:
290 return RESULT(UNICODE_LB_NONE);
291 default:
292 break;
293 }
294
295 if (prevclass == UNICODE_LB_BB)
296 return RESULT(UNICODE_LB_NONE);
297
298 if (uclass == UNICODE_LB_IN)
299 switch (prevclass) {
300 case UNICODE_LB_AL:
301 case UNICODE_LB_ID:
302 case UNICODE_LB_IN:
303 case UNICODE_LB_NU:
304 return RESULT(UNICODE_LB_NONE); /* LB22 */
305 default:
306 break;
307 }
308
309
310 if (prevclass == UNICODE_LB_ID && uclass == UNICODE_LB_PO)
311 return RESULT(UNICODE_LB_NONE); /* LB23 */
312 if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_NU)
313 return RESULT(UNICODE_LB_NONE); /* LB23 */
314
315 if (prevclass == UNICODE_LB_NU && uclass == UNICODE_LB_AL)
316 return RESULT(UNICODE_LB_NONE); /* LB23 */
317
318
319 if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_ID)
320 return RESULT(UNICODE_LB_NONE); /* LB24 */
321 if (prevclass == UNICODE_LB_PR && uclass == UNICODE_LB_AL)
322 return RESULT(UNICODE_LB_NONE); /* LB24 */
323 if (prevclass == UNICODE_LB_PO && uclass == UNICODE_LB_AL)
324 return RESULT(UNICODE_LB_NONE); /* LB24 */
325
326 if ((i->opts & UNICODE_LB_OPT_PRBREAK) && uclass == UNICODE_LB_PR)
327 switch (prevclass) {
328 case UNICODE_LB_PR:
329 case UNICODE_LB_AL:
330 case UNICODE_LB_ID:
331 return RESULT(UNICODE_LB_NONE);
332 }
333
334 if (!nolb25 &&
335 (prevclass == UNICODE_LB_PR || prevclass == UNICODE_LB_PO))
336 {
337 if (uclass == UNICODE_LB_NU)
338 return RESULT(UNICODE_LB_NONE); /* LB25 */
339
340 if (uclass == UNICODE_LB_OP || uclass == UNICODE_LB_HY)
341 {
342 i->prevclass=prevclass;
343 i->prevclass_nsp=prevclass_nsp;
344
345 i->savedclass=uclass;
346 i->savedcmcnt=0;
347 i->next_handler=next_lb25_seenophy;
348 i->end_handler=end_lb25_seenophy;
349 return 0;
350 }
351 }
352
353 if ((prevclass == UNICODE_LB_OP || prevclass == UNICODE_LB_HY) &&
354 uclass == UNICODE_LB_NU)
355 return RESULT(UNICODE_LB_NONE); /* LB25 */
356
357 /*****/
358
359 if (prevclass == UNICODE_LB_JL)
360 switch (uclass) {
361 case UNICODE_LB_JL:
362 case UNICODE_LB_JV:
363 case UNICODE_LB_H2:
364 case UNICODE_LB_H3:
365 return RESULT(UNICODE_LB_NONE); /* LB26 */
366 default:
367 break;
368 }
369
370 if ((prevclass == UNICODE_LB_JV ||
371 prevclass == UNICODE_LB_H2) &&
372 (uclass == UNICODE_LB_JV ||
373 uclass == UNICODE_LB_JT))
374 return RESULT(UNICODE_LB_NONE); /* LB26 */
375
376 if ((prevclass == UNICODE_LB_JT ||
377 prevclass == UNICODE_LB_H3) &&
378 uclass == UNICODE_LB_JT)
379 return RESULT(UNICODE_LB_NONE); /* LB26 */
380
381
382 switch (prevclass) {
383 case UNICODE_LB_JL:
384 case UNICODE_LB_JV:
385 case UNICODE_LB_JT:
386 case UNICODE_LB_H2:
387 case UNICODE_LB_H3:
388 if (uclass == UNICODE_LB_IN || uclass == UNICODE_LB_PO)
389 return RESULT(UNICODE_LB_NONE); /* LB27 */
390 default:
391 break;
392 }
393
394 switch (uclass) {
395 case UNICODE_LB_JL:
396 case UNICODE_LB_JV:
397 case UNICODE_LB_JT:
398 case UNICODE_LB_H2:
399 case UNICODE_LB_H3:
400 if (prevclass == UNICODE_LB_PR)
401 return RESULT(UNICODE_LB_NONE); /* LB27 */
402 default:
403 break;
404 }
405
406 if (prevclass == UNICODE_LB_AL && uclass == UNICODE_LB_AL)
407 return RESULT(UNICODE_LB_NONE); /* LB28 */
408
409 if (prevclass == UNICODE_LB_IS && uclass == UNICODE_LB_AL)
410 return RESULT(UNICODE_LB_NONE); /* LB29 */
411
412 if ((prevclass == UNICODE_LB_AL || prevclass == UNICODE_LB_NU) &&
413 uclass == UNICODE_LB_OP)
414 return RESULT(UNICODE_LB_NONE); /* LB30 */
415
416 if ((uclass == UNICODE_LB_AL || uclass == UNICODE_LB_NU) &&
417 prevclass == UNICODE_LB_CP)
418 return RESULT(UNICODE_LB_NONE); /* LB30 */
419
420 return RESULT(UNICODE_LB_ALLOWED); /* LB31 */
421}
422
423/*
424** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second
425** character, but NU did not follow. Backtrack.
426*/
427
428static int unwind_lb25_seenophy(unicode_lb_info_t i)
429{
430 int rc;
431
432 /*uint8_t class=i->savedclass;*/
433 int nolb25_flag=1;
434
435 i->next_handler=next_def;
436 i->end_handler=end_def;
437
438 do
439 {
440 rc=next_def_nolb25(i, i->savedclass, nolb25_flag);
441
442 if (rc)
443 return rc;
444
445 /*class=UNICODE_LB_CM;*/
446 nolb25_flag=0;
447 } while (i->savedcmcnt--);
448 return 0;
449}
450
451/*
452** Seen (PR|PO)(OP|HY), without returning the linebreak property for the second
453** character. If there's now a NU, we found the modified LB25 regexp.
454*/
455
456static int next_lb25_seenophy(unicode_lb_info_t i,
457 uint8_t uclass)
458{
459 int rc;
460
461 if (uclass == UNICODE_LB_CM)
462 {
463 ++i->savedcmcnt; /* Keep track of CMs, and try again */
464 return 0;
465 }
466
467 if (uclass != UNICODE_LB_NU)
468 {
469 rc=unwind_lb25_seenophy(i);
470
471 if (rc)
472 return rc;
473
474 return next_def_nolb25(i, uclass, 0);
475 }
476
477 do
478 {
479 rc=RESULT(UNICODE_LB_NONE); /* (OP|HY) feedback */
480
481 if (rc)
482 return rc;
483 } while (i->savedcmcnt--);
484
485 i->next_handler=next_lb25_seennu;
486 i->end_handler=end_def;
487 i->prevclass=i->prevclass_nsp=uclass;
488 return RESULT(UNICODE_LB_NONE);
489}
490
491/*
492** Seen (PR|PO)(OP|HY), and now The End. Unwind, and give up.
493*/
494
495static int end_lb25_seenophy(unicode_lb_info_t i)
496{
497 int rc=unwind_lb25_seenophy(i);
498
499 if (rc == 0)
500 rc=end_def(i);
501 return rc;
502}
503
504/*
505** Seen an NU, modified LB25 regexp.
506*/
507static int next_lb25_seennu(unicode_lb_info_t i, uint8_t uclass)
508{
509 if (uclass == UNICODE_LB_NU || uclass == UNICODE_LB_SY ||
510 uclass == UNICODE_LB_IS)
511 {
512 i->prevclass=i->prevclass_nsp=uclass;
513 return RESULT(UNICODE_LB_NONE);
514 }
515
516 if (uclass == UNICODE_LB_CM)
517 return RESULT(UNICODE_LB_NONE); /* LB9 */
518
519 if (uclass == UNICODE_LB_CL || uclass == UNICODE_LB_CP)
520 {
521 i->prevclass=i->prevclass_nsp=uclass;
522 i->next_handler=next_lb25_seennuclcp;
523 i->end_handler=end_def;
524 return RESULT(UNICODE_LB_NONE);
525 }
526
527 i->next_handler=next_def;
528 i->end_handler=end_def;
529
530 if (uclass == UNICODE_LB_PR || uclass == UNICODE_LB_PO)
531 {
532 i->prevclass=i->prevclass_nsp=uclass;
533 return RESULT(UNICODE_LB_NONE);
534 }
535
536 return next_def(i, uclass); /* Not a prefix, process normally */
537}
538
539/*
540** Seen CL|CP, in the modified LB25 regexp.
541*/
542static int next_lb25_seennuclcp(unicode_lb_info_t i, uint8_t uclass)
543{
544 if (uclass == UNICODE_LB_CM)
545 return RESULT(UNICODE_LB_NONE); /* LB9 */
546
547 i->next_handler=next_def;
548 i->end_handler=end_def;
549
550 if (uclass == UNICODE_LB_PR || uclass == UNICODE_LB_PO)
551 {
552 i->prevclass=i->prevclass_nsp=uclass;
553
554 return RESULT(UNICODE_LB_NONE);
555 }
556
557 return next_def(i, uclass);
558}
559
560/******************/
561
562struct unicode_lbc_info {
563 unicode_lb_info_t handle;
564
565 struct unicode_buf buf;
566
567 size_t buf_ptr;
568
569 int (*cb_func)(int, unicode_char, void *);
570 void *cb_arg;
571};
572
573static int unicode_lbc_callback(int value, void *ptr)
574{
575 unicode_lbc_info_t h=(unicode_lbc_info_t)ptr;
576
577 if (h->buf_ptr >= unicode_buf_len(&h->buf))
578 {
579 errno=EINVAL;
580 return -1; /* Shouldn't happen */
581 }
582
583 return (*h->cb_func)(value, unicode_buf_ptr(&h->buf)[h->buf_ptr++],
584 h->cb_arg);
585}
586
587unicode_lbc_info_t unicode_lbc_init(int (*cb_func)(int, unicode_char, void *),
588 void *cb_arg)
589{
590 unicode_lbc_info_t h=
591 (unicode_lbc_info_t)calloc(1, sizeof(struct unicode_lbc_info));
592
593 if (!h)
594 return NULL;
595
596 h->cb_func=cb_func;
597 h->cb_arg=cb_arg;
598
599 if ((h->handle=unicode_lb_init(unicode_lbc_callback, h)) == NULL)
600 {
601 free(h);
602 return NULL;
603 }
604 unicode_buf_init(&h->buf, (size_t)-1);
605 return h;
606}
607
608void unicode_lbc_set_opts(unicode_lbc_info_t i, int opts)
609{
610 unicode_lb_set_opts(i->handle, opts);
611}
612
613int unicode_lbc_next(unicode_lbc_info_t i, unicode_char ch)
614{
615 if (i->buf_ptr >= unicode_buf_len(&i->buf))
616 {
617 i->buf_ptr=0;
618 unicode_buf_clear(&i->buf);
619 }
620
621 unicode_buf_append(&i->buf, &ch, 1);
622 return unicode_lb_next(i->handle, ch);
623}
624
625int unicode_lbc_end(unicode_lbc_info_t i)
626{
627 int rc=unicode_lb_end(i->handle);
628
629 unicode_buf_deinit(&i->buf);
630 free(i);
631 return rc;
632}