Update Gnulib to v0.0-7509-g98a2286.
[bpt/guile.git] / lib / regex_internal.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License along
17 with this program; if not, see <http://www.gnu.org/licenses/>. */
18
19 static void re_string_construct_common (const char *str, Idx len,
20 re_string_t *pstr,
21 RE_TRANSLATE_TYPE trans, bool icase,
22 const re_dfa_t *dfa) internal_function;
23 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
24 const re_node_set *nodes,
25 re_hashval_t hash) internal_function;
26 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
27 const re_node_set *nodes,
28 unsigned int context,
29 re_hashval_t hash) internal_function;
30 \f
31 /* Functions for string operation. */
32
33 /* This function allocate the buffers. It is necessary to call
34 re_string_reconstruct before using the object. */
35
36 static reg_errcode_t
37 internal_function __attribute_warn_unused_result__
38 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
39 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
40 {
41 reg_errcode_t ret;
42 Idx init_buf_len;
43
44 /* Ensure at least one character fits into the buffers. */
45 if (init_len < dfa->mb_cur_max)
46 init_len = dfa->mb_cur_max;
47 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
48 re_string_construct_common (str, len, pstr, trans, icase, dfa);
49
50 ret = re_string_realloc_buffers (pstr, init_buf_len);
51 if (BE (ret != REG_NOERROR, 0))
52 return ret;
53
54 pstr->word_char = dfa->word_char;
55 pstr->word_ops_used = dfa->word_ops_used;
56 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
57 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
58 pstr->valid_raw_len = pstr->valid_len;
59 return REG_NOERROR;
60 }
61
62 /* This function allocate the buffers, and initialize them. */
63
64 static reg_errcode_t
65 internal_function __attribute_warn_unused_result__
66 re_string_construct (re_string_t *pstr, const char *str, Idx len,
67 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
68 {
69 reg_errcode_t ret;
70 memset (pstr, '\0', sizeof (re_string_t));
71 re_string_construct_common (str, len, pstr, trans, icase, dfa);
72
73 if (len > 0)
74 {
75 ret = re_string_realloc_buffers (pstr, len + 1);
76 if (BE (ret != REG_NOERROR, 0))
77 return ret;
78 }
79 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
80
81 if (icase)
82 {
83 #ifdef RE_ENABLE_I18N
84 if (dfa->mb_cur_max > 1)
85 {
86 while (1)
87 {
88 ret = build_wcs_upper_buffer (pstr);
89 if (BE (ret != REG_NOERROR, 0))
90 return ret;
91 if (pstr->valid_raw_len >= len)
92 break;
93 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
94 break;
95 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
96 if (BE (ret != REG_NOERROR, 0))
97 return ret;
98 }
99 }
100 else
101 #endif /* RE_ENABLE_I18N */
102 build_upper_buffer (pstr);
103 }
104 else
105 {
106 #ifdef RE_ENABLE_I18N
107 if (dfa->mb_cur_max > 1)
108 build_wcs_buffer (pstr);
109 else
110 #endif /* RE_ENABLE_I18N */
111 {
112 if (trans != NULL)
113 re_string_translate_buffer (pstr);
114 else
115 {
116 pstr->valid_len = pstr->bufs_len;
117 pstr->valid_raw_len = pstr->bufs_len;
118 }
119 }
120 }
121
122 return REG_NOERROR;
123 }
124
125 /* Helper functions for re_string_allocate, and re_string_construct. */
126
127 static reg_errcode_t
128 internal_function __attribute_warn_unused_result__
129 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
130 {
131 #ifdef RE_ENABLE_I18N
132 if (pstr->mb_cur_max > 1)
133 {
134 wint_t *new_wcs;
135
136 /* Avoid overflow in realloc. */
137 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
138 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
139 return REG_ESPACE;
140
141 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
142 if (BE (new_wcs == NULL, 0))
143 return REG_ESPACE;
144 pstr->wcs = new_wcs;
145 if (pstr->offsets != NULL)
146 {
147 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
148 if (BE (new_offsets == NULL, 0))
149 return REG_ESPACE;
150 pstr->offsets = new_offsets;
151 }
152 }
153 #endif /* RE_ENABLE_I18N */
154 if (pstr->mbs_allocated)
155 {
156 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
157 new_buf_len);
158 if (BE (new_mbs == NULL, 0))
159 return REG_ESPACE;
160 pstr->mbs = new_mbs;
161 }
162 pstr->bufs_len = new_buf_len;
163 return REG_NOERROR;
164 }
165
166
167 static void
168 internal_function
169 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
170 RE_TRANSLATE_TYPE trans, bool icase,
171 const re_dfa_t *dfa)
172 {
173 pstr->raw_mbs = (const unsigned char *) str;
174 pstr->len = len;
175 pstr->raw_len = len;
176 pstr->trans = trans;
177 pstr->icase = icase;
178 pstr->mbs_allocated = (trans != NULL || icase);
179 pstr->mb_cur_max = dfa->mb_cur_max;
180 pstr->is_utf8 = dfa->is_utf8;
181 pstr->map_notascii = dfa->map_notascii;
182 pstr->stop = pstr->len;
183 pstr->raw_stop = pstr->stop;
184 }
185
186 #ifdef RE_ENABLE_I18N
187
188 /* Build wide character buffer PSTR->WCS.
189 If the byte sequence of the string are:
190 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
191 Then wide character buffer will be:
192 <wc1> , WEOF , <wc2> , WEOF , <wc3>
193 We use WEOF for padding, they indicate that the position isn't
194 a first byte of a multibyte character.
195
196 Note that this function assumes PSTR->VALID_LEN elements are already
197 built and starts from PSTR->VALID_LEN. */
198
199 static void
200 internal_function
201 build_wcs_buffer (re_string_t *pstr)
202 {
203 #ifdef _LIBC
204 unsigned char buf[MB_LEN_MAX];
205 assert (MB_LEN_MAX >= pstr->mb_cur_max);
206 #else
207 unsigned char buf[64];
208 #endif
209 mbstate_t prev_st;
210 Idx byte_idx, end_idx, remain_len;
211 size_t mbclen;
212
213 /* Build the buffers from pstr->valid_len to either pstr->len or
214 pstr->bufs_len. */
215 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
216 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
217 {
218 wchar_t wc;
219 const char *p;
220
221 remain_len = end_idx - byte_idx;
222 prev_st = pstr->cur_state;
223 /* Apply the translation if we need. */
224 if (BE (pstr->trans != NULL, 0))
225 {
226 int i, ch;
227
228 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
229 {
230 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
231 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
232 }
233 p = (const char *) buf;
234 }
235 else
236 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
237 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
238 if (BE (mbclen == (size_t) -1 || mbclen == 0
239 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
240 {
241 /* We treat these cases as a singlebyte character. */
242 mbclen = 1;
243 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
244 if (BE (pstr->trans != NULL, 0))
245 wc = pstr->trans[wc];
246 pstr->cur_state = prev_st;
247 }
248 else if (BE (mbclen == (size_t) -2, 0))
249 {
250 /* The buffer doesn't have enough space, finish to build. */
251 pstr->cur_state = prev_st;
252 break;
253 }
254
255 /* Write wide character and padding. */
256 pstr->wcs[byte_idx++] = wc;
257 /* Write paddings. */
258 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
259 pstr->wcs[byte_idx++] = WEOF;
260 }
261 pstr->valid_len = byte_idx;
262 pstr->valid_raw_len = byte_idx;
263 }
264
265 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
266 but for REG_ICASE. */
267
268 static reg_errcode_t
269 internal_function __attribute_warn_unused_result__
270 build_wcs_upper_buffer (re_string_t *pstr)
271 {
272 mbstate_t prev_st;
273 Idx src_idx, byte_idx, end_idx, remain_len;
274 size_t mbclen;
275 #ifdef _LIBC
276 char buf[MB_LEN_MAX];
277 assert (MB_LEN_MAX >= pstr->mb_cur_max);
278 #else
279 char buf[64];
280 #endif
281
282 byte_idx = pstr->valid_len;
283 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
284
285 /* The following optimization assumes that ASCII characters can be
286 mapped to wide characters with a simple cast. */
287 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
288 {
289 while (byte_idx < end_idx)
290 {
291 wchar_t wc;
292
293 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
294 && mbsinit (&pstr->cur_state))
295 {
296 /* In case of a singlebyte character. */
297 pstr->mbs[byte_idx]
298 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
299 /* The next step uses the assumption that wchar_t is encoded
300 ASCII-safe: all ASCII values can be converted like this. */
301 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
302 ++byte_idx;
303 continue;
304 }
305
306 remain_len = end_idx - byte_idx;
307 prev_st = pstr->cur_state;
308 mbclen = __mbrtowc (&wc,
309 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
310 + byte_idx), remain_len, &pstr->cur_state);
311 if (BE (mbclen < (size_t) -2, 1))
312 {
313 wchar_t wcu = wc;
314 if (iswlower (wc))
315 {
316 size_t mbcdlen;
317
318 wcu = towupper (wc);
319 mbcdlen = wcrtomb (buf, wcu, &prev_st);
320 if (BE (mbclen == mbcdlen, 1))
321 memcpy (pstr->mbs + byte_idx, buf, mbclen);
322 else
323 {
324 src_idx = byte_idx;
325 goto offsets_needed;
326 }
327 }
328 else
329 memcpy (pstr->mbs + byte_idx,
330 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
331 pstr->wcs[byte_idx++] = wcu;
332 /* Write paddings. */
333 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
334 pstr->wcs[byte_idx++] = WEOF;
335 }
336 else if (mbclen == (size_t) -1 || mbclen == 0
337 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
338 {
339 /* It is an invalid character, an incomplete character
340 at the end of the string, or '\0'. Just use the byte. */
341 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
342 pstr->mbs[byte_idx] = ch;
343 /* And also cast it to wide char. */
344 pstr->wcs[byte_idx++] = (wchar_t) ch;
345 if (BE (mbclen == (size_t) -1, 0))
346 pstr->cur_state = prev_st;
347 }
348 else
349 {
350 /* The buffer doesn't have enough space, finish to build. */
351 pstr->cur_state = prev_st;
352 break;
353 }
354 }
355 pstr->valid_len = byte_idx;
356 pstr->valid_raw_len = byte_idx;
357 return REG_NOERROR;
358 }
359 else
360 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
361 {
362 wchar_t wc;
363 const char *p;
364 offsets_needed:
365 remain_len = end_idx - byte_idx;
366 prev_st = pstr->cur_state;
367 if (BE (pstr->trans != NULL, 0))
368 {
369 int i, ch;
370
371 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
372 {
373 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
374 buf[i] = pstr->trans[ch];
375 }
376 p = (const char *) buf;
377 }
378 else
379 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
380 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
381 if (BE (mbclen < (size_t) -2, 1))
382 {
383 wchar_t wcu = wc;
384 if (iswlower (wc))
385 {
386 size_t mbcdlen;
387
388 wcu = towupper (wc);
389 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
390 if (BE (mbclen == mbcdlen, 1))
391 memcpy (pstr->mbs + byte_idx, buf, mbclen);
392 else if (mbcdlen != (size_t) -1)
393 {
394 size_t i;
395
396 if (byte_idx + mbcdlen > pstr->bufs_len)
397 {
398 pstr->cur_state = prev_st;
399 break;
400 }
401
402 if (pstr->offsets == NULL)
403 {
404 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
405
406 if (pstr->offsets == NULL)
407 return REG_ESPACE;
408 }
409 if (!pstr->offsets_needed)
410 {
411 for (i = 0; i < (size_t) byte_idx; ++i)
412 pstr->offsets[i] = i;
413 pstr->offsets_needed = 1;
414 }
415
416 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
417 pstr->wcs[byte_idx] = wcu;
418 pstr->offsets[byte_idx] = src_idx;
419 for (i = 1; i < mbcdlen; ++i)
420 {
421 pstr->offsets[byte_idx + i]
422 = src_idx + (i < mbclen ? i : mbclen - 1);
423 pstr->wcs[byte_idx + i] = WEOF;
424 }
425 pstr->len += mbcdlen - mbclen;
426 if (pstr->raw_stop > src_idx)
427 pstr->stop += mbcdlen - mbclen;
428 end_idx = (pstr->bufs_len > pstr->len)
429 ? pstr->len : pstr->bufs_len;
430 byte_idx += mbcdlen;
431 src_idx += mbclen;
432 continue;
433 }
434 else
435 memcpy (pstr->mbs + byte_idx, p, mbclen);
436 }
437 else
438 memcpy (pstr->mbs + byte_idx, p, mbclen);
439
440 if (BE (pstr->offsets_needed != 0, 0))
441 {
442 size_t i;
443 for (i = 0; i < mbclen; ++i)
444 pstr->offsets[byte_idx + i] = src_idx + i;
445 }
446 src_idx += mbclen;
447
448 pstr->wcs[byte_idx++] = wcu;
449 /* Write paddings. */
450 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
451 pstr->wcs[byte_idx++] = WEOF;
452 }
453 else if (mbclen == (size_t) -1 || mbclen == 0
454 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
455 {
456 /* It is an invalid character or '\0'. Just use the byte. */
457 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
458
459 if (BE (pstr->trans != NULL, 0))
460 ch = pstr->trans [ch];
461 pstr->mbs[byte_idx] = ch;
462
463 if (BE (pstr->offsets_needed != 0, 0))
464 pstr->offsets[byte_idx] = src_idx;
465 ++src_idx;
466
467 /* And also cast it to wide char. */
468 pstr->wcs[byte_idx++] = (wchar_t) ch;
469 if (BE (mbclen == (size_t) -1, 0))
470 pstr->cur_state = prev_st;
471 }
472 else
473 {
474 /* The buffer doesn't have enough space, finish to build. */
475 pstr->cur_state = prev_st;
476 break;
477 }
478 }
479 pstr->valid_len = byte_idx;
480 pstr->valid_raw_len = src_idx;
481 return REG_NOERROR;
482 }
483
484 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
485 Return the index. */
486
487 static Idx
488 internal_function
489 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
490 {
491 mbstate_t prev_st;
492 Idx rawbuf_idx;
493 size_t mbclen;
494 wint_t wc = WEOF;
495
496 /* Skip the characters which are not necessary to check. */
497 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
498 rawbuf_idx < new_raw_idx;)
499 {
500 wchar_t wc2;
501 Idx remain_len = pstr->raw_len - rawbuf_idx;
502 prev_st = pstr->cur_state;
503 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
504 remain_len, &pstr->cur_state);
505 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
506 {
507 /* We treat these cases as a single byte character. */
508 if (mbclen == 0 || remain_len == 0)
509 wc = L'\0';
510 else
511 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
512 mbclen = 1;
513 pstr->cur_state = prev_st;
514 }
515 else
516 wc = wc2;
517 /* Then proceed the next character. */
518 rawbuf_idx += mbclen;
519 }
520 *last_wc = wc;
521 return rawbuf_idx;
522 }
523 #endif /* RE_ENABLE_I18N */
524
525 /* Build the buffer PSTR->MBS, and apply the translation if we need.
526 This function is used in case of REG_ICASE. */
527
528 static void
529 internal_function
530 build_upper_buffer (re_string_t *pstr)
531 {
532 Idx char_idx, end_idx;
533 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
534
535 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
536 {
537 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
538 if (BE (pstr->trans != NULL, 0))
539 ch = pstr->trans[ch];
540 if (islower (ch))
541 pstr->mbs[char_idx] = toupper (ch);
542 else
543 pstr->mbs[char_idx] = ch;
544 }
545 pstr->valid_len = char_idx;
546 pstr->valid_raw_len = char_idx;
547 }
548
549 /* Apply TRANS to the buffer in PSTR. */
550
551 static void
552 internal_function
553 re_string_translate_buffer (re_string_t *pstr)
554 {
555 Idx buf_idx, end_idx;
556 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
557
558 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
559 {
560 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
561 pstr->mbs[buf_idx] = pstr->trans[ch];
562 }
563
564 pstr->valid_len = buf_idx;
565 pstr->valid_raw_len = buf_idx;
566 }
567
568 /* This function re-construct the buffers.
569 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
570 convert to upper case in case of REG_ICASE, apply translation. */
571
572 static reg_errcode_t
573 internal_function __attribute_warn_unused_result__
574 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
575 {
576 Idx offset;
577
578 if (BE (pstr->raw_mbs_idx <= idx, 0))
579 offset = idx - pstr->raw_mbs_idx;
580 else
581 {
582 /* Reset buffer. */
583 #ifdef RE_ENABLE_I18N
584 if (pstr->mb_cur_max > 1)
585 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
586 #endif /* RE_ENABLE_I18N */
587 pstr->len = pstr->raw_len;
588 pstr->stop = pstr->raw_stop;
589 pstr->valid_len = 0;
590 pstr->raw_mbs_idx = 0;
591 pstr->valid_raw_len = 0;
592 pstr->offsets_needed = 0;
593 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
594 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
595 if (!pstr->mbs_allocated)
596 pstr->mbs = (unsigned char *) pstr->raw_mbs;
597 offset = idx;
598 }
599
600 if (BE (offset != 0, 1))
601 {
602 /* Should the already checked characters be kept? */
603 if (BE (offset < pstr->valid_raw_len, 1))
604 {
605 /* Yes, move them to the front of the buffer. */
606 #ifdef RE_ENABLE_I18N
607 if (BE (pstr->offsets_needed, 0))
608 {
609 Idx low = 0, high = pstr->valid_len, mid;
610 do
611 {
612 mid = (high + low) / 2;
613 if (pstr->offsets[mid] > offset)
614 high = mid;
615 else if (pstr->offsets[mid] < offset)
616 low = mid + 1;
617 else
618 break;
619 }
620 while (low < high);
621 if (pstr->offsets[mid] < offset)
622 ++mid;
623 pstr->tip_context = re_string_context_at (pstr, mid - 1,
624 eflags);
625 /* This can be quite complicated, so handle specially
626 only the common and easy case where the character with
627 different length representation of lower and upper
628 case is present at or after offset. */
629 if (pstr->valid_len > offset
630 && mid == offset && pstr->offsets[mid] == offset)
631 {
632 memmove (pstr->wcs, pstr->wcs + offset,
633 (pstr->valid_len - offset) * sizeof (wint_t));
634 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
635 pstr->valid_len -= offset;
636 pstr->valid_raw_len -= offset;
637 for (low = 0; low < pstr->valid_len; low++)
638 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
639 }
640 else
641 {
642 /* Otherwise, just find out how long the partial multibyte
643 character at offset is and fill it with WEOF/255. */
644 pstr->len = pstr->raw_len - idx + offset;
645 pstr->stop = pstr->raw_stop - idx + offset;
646 pstr->offsets_needed = 0;
647 while (mid > 0 && pstr->offsets[mid - 1] == offset)
648 --mid;
649 while (mid < pstr->valid_len)
650 if (pstr->wcs[mid] != WEOF)
651 break;
652 else
653 ++mid;
654 if (mid == pstr->valid_len)
655 pstr->valid_len = 0;
656 else
657 {
658 pstr->valid_len = pstr->offsets[mid] - offset;
659 if (pstr->valid_len)
660 {
661 for (low = 0; low < pstr->valid_len; ++low)
662 pstr->wcs[low] = WEOF;
663 memset (pstr->mbs, 255, pstr->valid_len);
664 }
665 }
666 pstr->valid_raw_len = pstr->valid_len;
667 }
668 }
669 else
670 #endif
671 {
672 pstr->tip_context = re_string_context_at (pstr, offset - 1,
673 eflags);
674 #ifdef RE_ENABLE_I18N
675 if (pstr->mb_cur_max > 1)
676 memmove (pstr->wcs, pstr->wcs + offset,
677 (pstr->valid_len - offset) * sizeof (wint_t));
678 #endif /* RE_ENABLE_I18N */
679 if (BE (pstr->mbs_allocated, 0))
680 memmove (pstr->mbs, pstr->mbs + offset,
681 pstr->valid_len - offset);
682 pstr->valid_len -= offset;
683 pstr->valid_raw_len -= offset;
684 #if DEBUG
685 assert (pstr->valid_len > 0);
686 #endif
687 }
688 }
689 else
690 {
691 #ifdef RE_ENABLE_I18N
692 /* No, skip all characters until IDX. */
693 Idx prev_valid_len = pstr->valid_len;
694
695 if (BE (pstr->offsets_needed, 0))
696 {
697 pstr->len = pstr->raw_len - idx + offset;
698 pstr->stop = pstr->raw_stop - idx + offset;
699 pstr->offsets_needed = 0;
700 }
701 #endif
702 pstr->valid_len = 0;
703 #ifdef RE_ENABLE_I18N
704 if (pstr->mb_cur_max > 1)
705 {
706 Idx wcs_idx;
707 wint_t wc = WEOF;
708
709 if (pstr->is_utf8)
710 {
711 const unsigned char *raw, *p, *end;
712
713 /* Special case UTF-8. Multi-byte chars start with any
714 byte other than 0x80 - 0xbf. */
715 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
716 end = raw + (offset - pstr->mb_cur_max);
717 if (end < pstr->raw_mbs)
718 end = pstr->raw_mbs;
719 p = raw + offset - 1;
720 #ifdef _LIBC
721 /* We know the wchar_t encoding is UCS4, so for the simple
722 case, ASCII characters, skip the conversion step. */
723 if (isascii (*p) && BE (pstr->trans == NULL, 1))
724 {
725 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
726 /* pstr->valid_len = 0; */
727 wc = (wchar_t) *p;
728 }
729 else
730 #endif
731 for (; p >= end; --p)
732 if ((*p & 0xc0) != 0x80)
733 {
734 mbstate_t cur_state;
735 wchar_t wc2;
736 Idx mlen = raw + pstr->len - p;
737 unsigned char buf[6];
738 size_t mbclen;
739
740 const unsigned char *pp = p;
741 if (BE (pstr->trans != NULL, 0))
742 {
743 int i = mlen < 6 ? mlen : 6;
744 while (--i >= 0)
745 buf[i] = pstr->trans[p[i]];
746 pp = buf;
747 }
748 /* XXX Don't use mbrtowc, we know which conversion
749 to use (UTF-8 -> UCS4). */
750 memset (&cur_state, 0, sizeof (cur_state));
751 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
752 &cur_state);
753 if (raw + offset - p <= mbclen
754 && mbclen < (size_t) -2)
755 {
756 memset (&pstr->cur_state, '\0',
757 sizeof (mbstate_t));
758 pstr->valid_len = mbclen - (raw + offset - p);
759 wc = wc2;
760 }
761 break;
762 }
763 }
764
765 if (wc == WEOF)
766 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
767 if (wc == WEOF)
768 pstr->tip_context
769 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
770 else
771 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
772 && IS_WIDE_WORD_CHAR (wc))
773 ? CONTEXT_WORD
774 : ((IS_WIDE_NEWLINE (wc)
775 && pstr->newline_anchor)
776 ? CONTEXT_NEWLINE : 0));
777 if (BE (pstr->valid_len, 0))
778 {
779 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
780 pstr->wcs[wcs_idx] = WEOF;
781 if (pstr->mbs_allocated)
782 memset (pstr->mbs, 255, pstr->valid_len);
783 }
784 pstr->valid_raw_len = pstr->valid_len;
785 }
786 else
787 #endif /* RE_ENABLE_I18N */
788 {
789 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
790 pstr->valid_raw_len = 0;
791 if (pstr->trans)
792 c = pstr->trans[c];
793 pstr->tip_context = (bitset_contain (pstr->word_char, c)
794 ? CONTEXT_WORD
795 : ((IS_NEWLINE (c) && pstr->newline_anchor)
796 ? CONTEXT_NEWLINE : 0));
797 }
798 }
799 if (!BE (pstr->mbs_allocated, 0))
800 pstr->mbs += offset;
801 }
802 pstr->raw_mbs_idx = idx;
803 pstr->len -= offset;
804 pstr->stop -= offset;
805
806 /* Then build the buffers. */
807 #ifdef RE_ENABLE_I18N
808 if (pstr->mb_cur_max > 1)
809 {
810 if (pstr->icase)
811 {
812 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
813 if (BE (ret != REG_NOERROR, 0))
814 return ret;
815 }
816 else
817 build_wcs_buffer (pstr);
818 }
819 else
820 #endif /* RE_ENABLE_I18N */
821 if (BE (pstr->mbs_allocated, 0))
822 {
823 if (pstr->icase)
824 build_upper_buffer (pstr);
825 else if (pstr->trans != NULL)
826 re_string_translate_buffer (pstr);
827 }
828 else
829 pstr->valid_len = pstr->len;
830
831 pstr->cur_idx = 0;
832 return REG_NOERROR;
833 }
834
835 static unsigned char
836 internal_function __attribute ((pure))
837 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
838 {
839 int ch;
840 Idx off;
841
842 /* Handle the common (easiest) cases first. */
843 if (BE (!pstr->mbs_allocated, 1))
844 return re_string_peek_byte (pstr, idx);
845
846 #ifdef RE_ENABLE_I18N
847 if (pstr->mb_cur_max > 1
848 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
849 return re_string_peek_byte (pstr, idx);
850 #endif
851
852 off = pstr->cur_idx + idx;
853 #ifdef RE_ENABLE_I18N
854 if (pstr->offsets_needed)
855 off = pstr->offsets[off];
856 #endif
857
858 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
859
860 #ifdef RE_ENABLE_I18N
861 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
862 this function returns CAPITAL LETTER I instead of first byte of
863 DOTLESS SMALL LETTER I. The latter would confuse the parser,
864 since peek_byte_case doesn't advance cur_idx in any way. */
865 if (pstr->offsets_needed && !isascii (ch))
866 return re_string_peek_byte (pstr, idx);
867 #endif
868
869 return ch;
870 }
871
872 static unsigned char
873 internal_function
874 re_string_fetch_byte_case (re_string_t *pstr)
875 {
876 if (BE (!pstr->mbs_allocated, 1))
877 return re_string_fetch_byte (pstr);
878
879 #ifdef RE_ENABLE_I18N
880 if (pstr->offsets_needed)
881 {
882 Idx off;
883 int ch;
884
885 /* For tr_TR.UTF-8 [[:islower:]] there is
886 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
887 in that case the whole multi-byte character and return
888 the original letter. On the other side, with
889 [[: DOTLESS SMALL LETTER I return [[:I, as doing
890 anything else would complicate things too much. */
891
892 if (!re_string_first_byte (pstr, pstr->cur_idx))
893 return re_string_fetch_byte (pstr);
894
895 off = pstr->offsets[pstr->cur_idx];
896 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
897
898 if (! isascii (ch))
899 return re_string_fetch_byte (pstr);
900
901 re_string_skip_bytes (pstr,
902 re_string_char_size_at (pstr, pstr->cur_idx));
903 return ch;
904 }
905 #endif
906
907 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
908 }
909
910 static void
911 internal_function
912 re_string_destruct (re_string_t *pstr)
913 {
914 #ifdef RE_ENABLE_I18N
915 re_free (pstr->wcs);
916 re_free (pstr->offsets);
917 #endif /* RE_ENABLE_I18N */
918 if (pstr->mbs_allocated)
919 re_free (pstr->mbs);
920 }
921
922 /* Return the context at IDX in INPUT. */
923
924 static unsigned int
925 internal_function
926 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
927 {
928 int c;
929 if (BE (! REG_VALID_INDEX (idx), 0))
930 /* In this case, we use the value stored in input->tip_context,
931 since we can't know the character in input->mbs[-1] here. */
932 return input->tip_context;
933 if (BE (idx == input->len, 0))
934 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
935 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
936 #ifdef RE_ENABLE_I18N
937 if (input->mb_cur_max > 1)
938 {
939 wint_t wc;
940 Idx wc_idx = idx;
941 while(input->wcs[wc_idx] == WEOF)
942 {
943 #ifdef DEBUG
944 /* It must not happen. */
945 assert (REG_VALID_INDEX (wc_idx));
946 #endif
947 --wc_idx;
948 if (! REG_VALID_INDEX (wc_idx))
949 return input->tip_context;
950 }
951 wc = input->wcs[wc_idx];
952 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
953 return CONTEXT_WORD;
954 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
955 ? CONTEXT_NEWLINE : 0);
956 }
957 else
958 #endif
959 {
960 c = re_string_byte_at (input, idx);
961 if (bitset_contain (input->word_char, c))
962 return CONTEXT_WORD;
963 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
964 }
965 }
966 \f
967 /* Functions for set operation. */
968
969 static reg_errcode_t
970 internal_function __attribute_warn_unused_result__
971 re_node_set_alloc (re_node_set *set, Idx size)
972 {
973 set->alloc = size;
974 set->nelem = 0;
975 set->elems = re_malloc (Idx, size);
976 if (BE (set->elems == NULL, 0))
977 return REG_ESPACE;
978 return REG_NOERROR;
979 }
980
981 static reg_errcode_t
982 internal_function __attribute_warn_unused_result__
983 re_node_set_init_1 (re_node_set *set, Idx elem)
984 {
985 set->alloc = 1;
986 set->nelem = 1;
987 set->elems = re_malloc (Idx, 1);
988 if (BE (set->elems == NULL, 0))
989 {
990 set->alloc = set->nelem = 0;
991 return REG_ESPACE;
992 }
993 set->elems[0] = elem;
994 return REG_NOERROR;
995 }
996
997 static reg_errcode_t
998 internal_function __attribute_warn_unused_result__
999 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1000 {
1001 set->alloc = 2;
1002 set->elems = re_malloc (Idx, 2);
1003 if (BE (set->elems == NULL, 0))
1004 return REG_ESPACE;
1005 if (elem1 == elem2)
1006 {
1007 set->nelem = 1;
1008 set->elems[0] = elem1;
1009 }
1010 else
1011 {
1012 set->nelem = 2;
1013 if (elem1 < elem2)
1014 {
1015 set->elems[0] = elem1;
1016 set->elems[1] = elem2;
1017 }
1018 else
1019 {
1020 set->elems[0] = elem2;
1021 set->elems[1] = elem1;
1022 }
1023 }
1024 return REG_NOERROR;
1025 }
1026
1027 static reg_errcode_t
1028 internal_function __attribute_warn_unused_result__
1029 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1030 {
1031 dest->nelem = src->nelem;
1032 if (src->nelem > 0)
1033 {
1034 dest->alloc = dest->nelem;
1035 dest->elems = re_malloc (Idx, dest->alloc);
1036 if (BE (dest->elems == NULL, 0))
1037 {
1038 dest->alloc = dest->nelem = 0;
1039 return REG_ESPACE;
1040 }
1041 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1042 }
1043 else
1044 re_node_set_init_empty (dest);
1045 return REG_NOERROR;
1046 }
1047
1048 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1049 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1050 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1051
1052 static reg_errcode_t
1053 internal_function __attribute_warn_unused_result__
1054 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1055 const re_node_set *src2)
1056 {
1057 Idx i1, i2, is, id, delta, sbase;
1058 if (src1->nelem == 0 || src2->nelem == 0)
1059 return REG_NOERROR;
1060
1061 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1062 conservative estimate. */
1063 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1064 {
1065 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1066 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1067 if (BE (new_elems == NULL, 0))
1068 return REG_ESPACE;
1069 dest->elems = new_elems;
1070 dest->alloc = new_alloc;
1071 }
1072
1073 /* Find the items in the intersection of SRC1 and SRC2, and copy
1074 into the top of DEST those that are not already in DEST itself. */
1075 sbase = dest->nelem + src1->nelem + src2->nelem;
1076 i1 = src1->nelem - 1;
1077 i2 = src2->nelem - 1;
1078 id = dest->nelem - 1;
1079 for (;;)
1080 {
1081 if (src1->elems[i1] == src2->elems[i2])
1082 {
1083 /* Try to find the item in DEST. Maybe we could binary search? */
1084 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1085 --id;
1086
1087 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1088 dest->elems[--sbase] = src1->elems[i1];
1089
1090 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1091 break;
1092 }
1093
1094 /* Lower the highest of the two items. */
1095 else if (src1->elems[i1] < src2->elems[i2])
1096 {
1097 if (! REG_VALID_INDEX (--i2))
1098 break;
1099 }
1100 else
1101 {
1102 if (! REG_VALID_INDEX (--i1))
1103 break;
1104 }
1105 }
1106
1107 id = dest->nelem - 1;
1108 is = dest->nelem + src1->nelem + src2->nelem - 1;
1109 delta = is - sbase + 1;
1110
1111 /* Now copy. When DELTA becomes zero, the remaining
1112 DEST elements are already in place; this is more or
1113 less the same loop that is in re_node_set_merge. */
1114 dest->nelem += delta;
1115 if (delta > 0 && REG_VALID_INDEX (id))
1116 for (;;)
1117 {
1118 if (dest->elems[is] > dest->elems[id])
1119 {
1120 /* Copy from the top. */
1121 dest->elems[id + delta--] = dest->elems[is--];
1122 if (delta == 0)
1123 break;
1124 }
1125 else
1126 {
1127 /* Slide from the bottom. */
1128 dest->elems[id + delta] = dest->elems[id];
1129 if (! REG_VALID_INDEX (--id))
1130 break;
1131 }
1132 }
1133
1134 /* Copy remaining SRC elements. */
1135 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1136
1137 return REG_NOERROR;
1138 }
1139
1140 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1141 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1142
1143 static reg_errcode_t
1144 internal_function __attribute_warn_unused_result__
1145 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1146 const re_node_set *src2)
1147 {
1148 Idx i1, i2, id;
1149 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1150 {
1151 dest->alloc = src1->nelem + src2->nelem;
1152 dest->elems = re_malloc (Idx, dest->alloc);
1153 if (BE (dest->elems == NULL, 0))
1154 return REG_ESPACE;
1155 }
1156 else
1157 {
1158 if (src1 != NULL && src1->nelem > 0)
1159 return re_node_set_init_copy (dest, src1);
1160 else if (src2 != NULL && src2->nelem > 0)
1161 return re_node_set_init_copy (dest, src2);
1162 else
1163 re_node_set_init_empty (dest);
1164 return REG_NOERROR;
1165 }
1166 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1167 {
1168 if (src1->elems[i1] > src2->elems[i2])
1169 {
1170 dest->elems[id++] = src2->elems[i2++];
1171 continue;
1172 }
1173 if (src1->elems[i1] == src2->elems[i2])
1174 ++i2;
1175 dest->elems[id++] = src1->elems[i1++];
1176 }
1177 if (i1 < src1->nelem)
1178 {
1179 memcpy (dest->elems + id, src1->elems + i1,
1180 (src1->nelem - i1) * sizeof (Idx));
1181 id += src1->nelem - i1;
1182 }
1183 else if (i2 < src2->nelem)
1184 {
1185 memcpy (dest->elems + id, src2->elems + i2,
1186 (src2->nelem - i2) * sizeof (Idx));
1187 id += src2->nelem - i2;
1188 }
1189 dest->nelem = id;
1190 return REG_NOERROR;
1191 }
1192
1193 /* Calculate the union set of the sets DEST and SRC. And store it to
1194 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1195
1196 static reg_errcode_t
1197 internal_function __attribute_warn_unused_result__
1198 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1199 {
1200 Idx is, id, sbase, delta;
1201 if (src == NULL || src->nelem == 0)
1202 return REG_NOERROR;
1203 if (dest->alloc < 2 * src->nelem + dest->nelem)
1204 {
1205 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1206 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1207 if (BE (new_buffer == NULL, 0))
1208 return REG_ESPACE;
1209 dest->elems = new_buffer;
1210 dest->alloc = new_alloc;
1211 }
1212
1213 if (BE (dest->nelem == 0, 0))
1214 {
1215 dest->nelem = src->nelem;
1216 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1217 return REG_NOERROR;
1218 }
1219
1220 /* Copy into the top of DEST the items of SRC that are not
1221 found in DEST. Maybe we could binary search in DEST? */
1222 for (sbase = dest->nelem + 2 * src->nelem,
1223 is = src->nelem - 1, id = dest->nelem - 1;
1224 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1225 {
1226 if (dest->elems[id] == src->elems[is])
1227 is--, id--;
1228 else if (dest->elems[id] < src->elems[is])
1229 dest->elems[--sbase] = src->elems[is--];
1230 else /* if (dest->elems[id] > src->elems[is]) */
1231 --id;
1232 }
1233
1234 if (REG_VALID_INDEX (is))
1235 {
1236 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1237 sbase -= is + 1;
1238 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1239 }
1240
1241 id = dest->nelem - 1;
1242 is = dest->nelem + 2 * src->nelem - 1;
1243 delta = is - sbase + 1;
1244 if (delta == 0)
1245 return REG_NOERROR;
1246
1247 /* Now copy. When DELTA becomes zero, the remaining
1248 DEST elements are already in place. */
1249 dest->nelem += delta;
1250 for (;;)
1251 {
1252 if (dest->elems[is] > dest->elems[id])
1253 {
1254 /* Copy from the top. */
1255 dest->elems[id + delta--] = dest->elems[is--];
1256 if (delta == 0)
1257 break;
1258 }
1259 else
1260 {
1261 /* Slide from the bottom. */
1262 dest->elems[id + delta] = dest->elems[id];
1263 if (! REG_VALID_INDEX (--id))
1264 {
1265 /* Copy remaining SRC elements. */
1266 memcpy (dest->elems, dest->elems + sbase,
1267 delta * sizeof (Idx));
1268 break;
1269 }
1270 }
1271 }
1272
1273 return REG_NOERROR;
1274 }
1275
1276 /* Insert the new element ELEM to the re_node_set* SET.
1277 SET should not already have ELEM.
1278 Return true if successful. */
1279
1280 static bool
1281 internal_function __attribute_warn_unused_result__
1282 re_node_set_insert (re_node_set *set, Idx elem)
1283 {
1284 Idx idx;
1285 /* In case the set is empty. */
1286 if (set->alloc == 0)
1287 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1288
1289 if (BE (set->nelem, 0) == 0)
1290 {
1291 /* We already guaranteed above that set->alloc != 0. */
1292 set->elems[0] = elem;
1293 ++set->nelem;
1294 return true;
1295 }
1296
1297 /* Realloc if we need. */
1298 if (set->alloc == set->nelem)
1299 {
1300 Idx *new_elems;
1301 set->alloc = set->alloc * 2;
1302 new_elems = re_realloc (set->elems, Idx, set->alloc);
1303 if (BE (new_elems == NULL, 0))
1304 return false;
1305 set->elems = new_elems;
1306 }
1307
1308 /* Move the elements which follows the new element. Test the
1309 first element separately to skip a check in the inner loop. */
1310 if (elem < set->elems[0])
1311 {
1312 idx = 0;
1313 for (idx = set->nelem; idx > 0; idx--)
1314 set->elems[idx] = set->elems[idx - 1];
1315 }
1316 else
1317 {
1318 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1319 set->elems[idx] = set->elems[idx - 1];
1320 }
1321
1322 /* Insert the new element. */
1323 set->elems[idx] = elem;
1324 ++set->nelem;
1325 return true;
1326 }
1327
1328 /* Insert the new element ELEM to the re_node_set* SET.
1329 SET should not already have any element greater than or equal to ELEM.
1330 Return true if successful. */
1331
1332 static bool
1333 internal_function __attribute_warn_unused_result__
1334 re_node_set_insert_last (re_node_set *set, Idx elem)
1335 {
1336 /* Realloc if we need. */
1337 if (set->alloc == set->nelem)
1338 {
1339 Idx *new_elems;
1340 set->alloc = (set->alloc + 1) * 2;
1341 new_elems = re_realloc (set->elems, Idx, set->alloc);
1342 if (BE (new_elems == NULL, 0))
1343 return false;
1344 set->elems = new_elems;
1345 }
1346
1347 /* Insert the new element. */
1348 set->elems[set->nelem++] = elem;
1349 return true;
1350 }
1351
1352 /* Compare two node sets SET1 and SET2.
1353 Return true if SET1 and SET2 are equivalent. */
1354
1355 static bool
1356 internal_function __attribute ((pure))
1357 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1358 {
1359 Idx i;
1360 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1361 return false;
1362 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1363 if (set1->elems[i] != set2->elems[i])
1364 return false;
1365 return true;
1366 }
1367
1368 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1369
1370 static Idx
1371 internal_function __attribute ((pure))
1372 re_node_set_contains (const re_node_set *set, Idx elem)
1373 {
1374 __re_size_t idx, right, mid;
1375 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1376 return 0;
1377
1378 /* Binary search the element. */
1379 idx = 0;
1380 right = set->nelem - 1;
1381 while (idx < right)
1382 {
1383 mid = (idx + right) / 2;
1384 if (set->elems[mid] < elem)
1385 idx = mid + 1;
1386 else
1387 right = mid;
1388 }
1389 return set->elems[idx] == elem ? idx + 1 : 0;
1390 }
1391
1392 static void
1393 internal_function
1394 re_node_set_remove_at (re_node_set *set, Idx idx)
1395 {
1396 if (idx < 0 || idx >= set->nelem)
1397 return;
1398 --set->nelem;
1399 for (; idx < set->nelem; idx++)
1400 set->elems[idx] = set->elems[idx + 1];
1401 }
1402 \f
1403
1404 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1405 Or return REG_MISSING if an error occurred. */
1406
1407 static Idx
1408 internal_function
1409 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1410 {
1411 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1412 {
1413 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1414 Idx *new_nexts, *new_indices;
1415 re_node_set *new_edests, *new_eclosures;
1416 re_token_t *new_nodes;
1417
1418 /* Avoid overflows in realloc. */
1419 const size_t max_object_size = MAX (sizeof (re_token_t),
1420 MAX (sizeof (re_node_set),
1421 sizeof (Idx)));
1422 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1423 return REG_MISSING;
1424
1425 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1426 if (BE (new_nodes == NULL, 0))
1427 return REG_MISSING;
1428 dfa->nodes = new_nodes;
1429 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1430 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1431 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1432 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1433 if (BE (new_nexts == NULL || new_indices == NULL
1434 || new_edests == NULL || new_eclosures == NULL, 0))
1435 return REG_MISSING;
1436 dfa->nexts = new_nexts;
1437 dfa->org_indices = new_indices;
1438 dfa->edests = new_edests;
1439 dfa->eclosures = new_eclosures;
1440 dfa->nodes_alloc = new_nodes_alloc;
1441 }
1442 dfa->nodes[dfa->nodes_len] = token;
1443 dfa->nodes[dfa->nodes_len].constraint = 0;
1444 #ifdef RE_ENABLE_I18N
1445 {
1446 int type = token.type;
1447 dfa->nodes[dfa->nodes_len].accept_mb =
1448 (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1449 }
1450 #endif
1451 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1452 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1453 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1454 return dfa->nodes_len++;
1455 }
1456
1457 static inline re_hashval_t
1458 internal_function
1459 calc_state_hash (const re_node_set *nodes, unsigned int context)
1460 {
1461 re_hashval_t hash = nodes->nelem + context;
1462 Idx i;
1463 for (i = 0 ; i < nodes->nelem ; i++)
1464 hash += nodes->elems[i];
1465 return hash;
1466 }
1467
1468 /* Search for the state whose node_set is equivalent to NODES.
1469 Return the pointer to the state, if we found it in the DFA.
1470 Otherwise create the new one and return it. In case of an error
1471 return NULL and set the error code in ERR.
1472 Note: - We assume NULL as the invalid state, then it is possible that
1473 return value is NULL and ERR is REG_NOERROR.
1474 - We never return non-NULL value in case of any errors, it is for
1475 optimization. */
1476
1477 static re_dfastate_t *
1478 internal_function __attribute_warn_unused_result__
1479 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1480 const re_node_set *nodes)
1481 {
1482 re_hashval_t hash;
1483 re_dfastate_t *new_state;
1484 struct re_state_table_entry *spot;
1485 Idx i;
1486 #ifdef lint
1487 /* Suppress bogus uninitialized-variable warnings. */
1488 *err = REG_NOERROR;
1489 #endif
1490 if (BE (nodes->nelem == 0, 0))
1491 {
1492 *err = REG_NOERROR;
1493 return NULL;
1494 }
1495 hash = calc_state_hash (nodes, 0);
1496 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1497
1498 for (i = 0 ; i < spot->num ; i++)
1499 {
1500 re_dfastate_t *state = spot->array[i];
1501 if (hash != state->hash)
1502 continue;
1503 if (re_node_set_compare (&state->nodes, nodes))
1504 return state;
1505 }
1506
1507 /* There are no appropriate state in the dfa, create the new one. */
1508 new_state = create_ci_newstate (dfa, nodes, hash);
1509 if (BE (new_state == NULL, 0))
1510 *err = REG_ESPACE;
1511
1512 return new_state;
1513 }
1514
1515 /* Search for the state whose node_set is equivalent to NODES and
1516 whose context is equivalent to CONTEXT.
1517 Return the pointer to the state, if we found it in the DFA.
1518 Otherwise create the new one and return it. In case of an error
1519 return NULL and set the error code in ERR.
1520 Note: - We assume NULL as the invalid state, then it is possible that
1521 return value is NULL and ERR is REG_NOERROR.
1522 - We never return non-NULL value in case of any errors, it is for
1523 optimization. */
1524
1525 static re_dfastate_t *
1526 internal_function __attribute_warn_unused_result__
1527 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1528 const re_node_set *nodes, unsigned int context)
1529 {
1530 re_hashval_t hash;
1531 re_dfastate_t *new_state;
1532 struct re_state_table_entry *spot;
1533 Idx i;
1534 #ifdef lint
1535 /* Suppress bogus uninitialized-variable warnings. */
1536 *err = REG_NOERROR;
1537 #endif
1538 if (nodes->nelem == 0)
1539 {
1540 *err = REG_NOERROR;
1541 return NULL;
1542 }
1543 hash = calc_state_hash (nodes, context);
1544 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1545
1546 for (i = 0 ; i < spot->num ; i++)
1547 {
1548 re_dfastate_t *state = spot->array[i];
1549 if (state->hash == hash
1550 && state->context == context
1551 && re_node_set_compare (state->entrance_nodes, nodes))
1552 return state;
1553 }
1554 /* There are no appropriate state in 'dfa', create the new one. */
1555 new_state = create_cd_newstate (dfa, nodes, context, hash);
1556 if (BE (new_state == NULL, 0))
1557 *err = REG_ESPACE;
1558
1559 return new_state;
1560 }
1561
1562 /* Finish initialization of the new state NEWSTATE, and using its hash value
1563 HASH put in the appropriate bucket of DFA's state table. Return value
1564 indicates the error code if failed. */
1565
1566 static reg_errcode_t
1567 __attribute_warn_unused_result__
1568 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1569 re_hashval_t hash)
1570 {
1571 struct re_state_table_entry *spot;
1572 reg_errcode_t err;
1573 Idx i;
1574
1575 newstate->hash = hash;
1576 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1577 if (BE (err != REG_NOERROR, 0))
1578 return REG_ESPACE;
1579 for (i = 0; i < newstate->nodes.nelem; i++)
1580 {
1581 Idx elem = newstate->nodes.elems[i];
1582 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1583 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1584 return REG_ESPACE;
1585 }
1586
1587 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1588 if (BE (spot->alloc <= spot->num, 0))
1589 {
1590 Idx new_alloc = 2 * spot->num + 2;
1591 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1592 new_alloc);
1593 if (BE (new_array == NULL, 0))
1594 return REG_ESPACE;
1595 spot->array = new_array;
1596 spot->alloc = new_alloc;
1597 }
1598 spot->array[spot->num++] = newstate;
1599 return REG_NOERROR;
1600 }
1601
1602 static void
1603 free_state (re_dfastate_t *state)
1604 {
1605 re_node_set_free (&state->non_eps_nodes);
1606 re_node_set_free (&state->inveclosure);
1607 if (state->entrance_nodes != &state->nodes)
1608 {
1609 re_node_set_free (state->entrance_nodes);
1610 re_free (state->entrance_nodes);
1611 }
1612 re_node_set_free (&state->nodes);
1613 re_free (state->word_trtable);
1614 re_free (state->trtable);
1615 re_free (state);
1616 }
1617
1618 /* Create the new state which is independent of contexts.
1619 Return the new state if succeeded, otherwise return NULL. */
1620
1621 static re_dfastate_t *
1622 internal_function __attribute_warn_unused_result__
1623 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1624 re_hashval_t hash)
1625 {
1626 Idx i;
1627 reg_errcode_t err;
1628 re_dfastate_t *newstate;
1629
1630 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1631 if (BE (newstate == NULL, 0))
1632 return NULL;
1633 err = re_node_set_init_copy (&newstate->nodes, nodes);
1634 if (BE (err != REG_NOERROR, 0))
1635 {
1636 re_free (newstate);
1637 return NULL;
1638 }
1639
1640 newstate->entrance_nodes = &newstate->nodes;
1641 for (i = 0 ; i < nodes->nelem ; i++)
1642 {
1643 re_token_t *node = dfa->nodes + nodes->elems[i];
1644 re_token_type_t type = node->type;
1645 if (type == CHARACTER && !node->constraint)
1646 continue;
1647 #ifdef RE_ENABLE_I18N
1648 newstate->accept_mb |= node->accept_mb;
1649 #endif /* RE_ENABLE_I18N */
1650
1651 /* If the state has the halt node, the state is a halt state. */
1652 if (type == END_OF_RE)
1653 newstate->halt = 1;
1654 else if (type == OP_BACK_REF)
1655 newstate->has_backref = 1;
1656 else if (type == ANCHOR || node->constraint)
1657 newstate->has_constraint = 1;
1658 }
1659 err = register_state (dfa, newstate, hash);
1660 if (BE (err != REG_NOERROR, 0))
1661 {
1662 free_state (newstate);
1663 newstate = NULL;
1664 }
1665 return newstate;
1666 }
1667
1668 /* Create the new state which is depend on the context CONTEXT.
1669 Return the new state if succeeded, otherwise return NULL. */
1670
1671 static re_dfastate_t *
1672 internal_function __attribute_warn_unused_result__
1673 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1674 unsigned int context, re_hashval_t hash)
1675 {
1676 Idx i, nctx_nodes = 0;
1677 reg_errcode_t err;
1678 re_dfastate_t *newstate;
1679
1680 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1681 if (BE (newstate == NULL, 0))
1682 return NULL;
1683 err = re_node_set_init_copy (&newstate->nodes, nodes);
1684 if (BE (err != REG_NOERROR, 0))
1685 {
1686 re_free (newstate);
1687 return NULL;
1688 }
1689
1690 newstate->context = context;
1691 newstate->entrance_nodes = &newstate->nodes;
1692
1693 for (i = 0 ; i < nodes->nelem ; i++)
1694 {
1695 re_token_t *node = dfa->nodes + nodes->elems[i];
1696 re_token_type_t type = node->type;
1697 unsigned int constraint = node->constraint;
1698
1699 if (type == CHARACTER && !constraint)
1700 continue;
1701 #ifdef RE_ENABLE_I18N
1702 newstate->accept_mb |= node->accept_mb;
1703 #endif /* RE_ENABLE_I18N */
1704
1705 /* If the state has the halt node, the state is a halt state. */
1706 if (type == END_OF_RE)
1707 newstate->halt = 1;
1708 else if (type == OP_BACK_REF)
1709 newstate->has_backref = 1;
1710
1711 if (constraint)
1712 {
1713 if (newstate->entrance_nodes == &newstate->nodes)
1714 {
1715 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1716 if (BE (newstate->entrance_nodes == NULL, 0))
1717 {
1718 free_state (newstate);
1719 return NULL;
1720 }
1721 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1722 != REG_NOERROR)
1723 return NULL;
1724 nctx_nodes = 0;
1725 newstate->has_constraint = 1;
1726 }
1727
1728 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1729 {
1730 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1731 ++nctx_nodes;
1732 }
1733 }
1734 }
1735 err = register_state (dfa, newstate, hash);
1736 if (BE (err != REG_NOERROR, 0))
1737 {
1738 free_state (newstate);
1739 newstate = NULL;
1740 }
1741 return newstate;
1742 }