defsubst
[bpt/guile.git] / lib / regex_internal.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2014 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 static void re_string_construct_common (const char *str, Idx len,
21 re_string_t *pstr,
22 RE_TRANSLATE_TYPE trans, bool icase,
23 const re_dfa_t *dfa) internal_function;
24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
25 const re_node_set *nodes,
26 re_hashval_t hash) internal_function;
27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
28 const re_node_set *nodes,
29 unsigned int context,
30 re_hashval_t hash) internal_function;
31 \f
32 /* Functions for string operation. */
33
34 /* This function allocate the buffers. It is necessary to call
35 re_string_reconstruct before using the object. */
36
37 static reg_errcode_t
38 internal_function __attribute_warn_unused_result__
39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
40 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
41 {
42 reg_errcode_t ret;
43 Idx init_buf_len;
44
45 /* Ensure at least one character fits into the buffers. */
46 if (init_len < dfa->mb_cur_max)
47 init_len = dfa->mb_cur_max;
48 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
49 re_string_construct_common (str, len, pstr, trans, icase, dfa);
50
51 ret = re_string_realloc_buffers (pstr, init_buf_len);
52 if (BE (ret != REG_NOERROR, 0))
53 return ret;
54
55 pstr->word_char = dfa->word_char;
56 pstr->word_ops_used = dfa->word_ops_used;
57 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
58 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
59 pstr->valid_raw_len = pstr->valid_len;
60 return REG_NOERROR;
61 }
62
63 /* This function allocate the buffers, and initialize them. */
64
65 static reg_errcode_t
66 internal_function __attribute_warn_unused_result__
67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
68 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
69 {
70 reg_errcode_t ret;
71 memset (pstr, '\0', sizeof (re_string_t));
72 re_string_construct_common (str, len, pstr, trans, icase, dfa);
73
74 if (len > 0)
75 {
76 ret = re_string_realloc_buffers (pstr, len + 1);
77 if (BE (ret != REG_NOERROR, 0))
78 return ret;
79 }
80 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
81
82 if (icase)
83 {
84 #ifdef RE_ENABLE_I18N
85 if (dfa->mb_cur_max > 1)
86 {
87 while (1)
88 {
89 ret = build_wcs_upper_buffer (pstr);
90 if (BE (ret != REG_NOERROR, 0))
91 return ret;
92 if (pstr->valid_raw_len >= len)
93 break;
94 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
95 break;
96 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
97 if (BE (ret != REG_NOERROR, 0))
98 return ret;
99 }
100 }
101 else
102 #endif /* RE_ENABLE_I18N */
103 build_upper_buffer (pstr);
104 }
105 else
106 {
107 #ifdef RE_ENABLE_I18N
108 if (dfa->mb_cur_max > 1)
109 build_wcs_buffer (pstr);
110 else
111 #endif /* RE_ENABLE_I18N */
112 {
113 if (trans != NULL)
114 re_string_translate_buffer (pstr);
115 else
116 {
117 pstr->valid_len = pstr->bufs_len;
118 pstr->valid_raw_len = pstr->bufs_len;
119 }
120 }
121 }
122
123 return REG_NOERROR;
124 }
125
126 /* Helper functions for re_string_allocate, and re_string_construct. */
127
128 static reg_errcode_t
129 internal_function __attribute_warn_unused_result__
130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
131 {
132 #ifdef RE_ENABLE_I18N
133 if (pstr->mb_cur_max > 1)
134 {
135 wint_t *new_wcs;
136
137 /* Avoid overflow in realloc. */
138 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
139 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
140 return REG_ESPACE;
141
142 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
143 if (BE (new_wcs == NULL, 0))
144 return REG_ESPACE;
145 pstr->wcs = new_wcs;
146 if (pstr->offsets != NULL)
147 {
148 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
149 if (BE (new_offsets == NULL, 0))
150 return REG_ESPACE;
151 pstr->offsets = new_offsets;
152 }
153 }
154 #endif /* RE_ENABLE_I18N */
155 if (pstr->mbs_allocated)
156 {
157 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
158 new_buf_len);
159 if (BE (new_mbs == NULL, 0))
160 return REG_ESPACE;
161 pstr->mbs = new_mbs;
162 }
163 pstr->bufs_len = new_buf_len;
164 return REG_NOERROR;
165 }
166
167
168 static void
169 internal_function
170 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
171 RE_TRANSLATE_TYPE trans, bool icase,
172 const re_dfa_t *dfa)
173 {
174 pstr->raw_mbs = (const unsigned char *) str;
175 pstr->len = len;
176 pstr->raw_len = len;
177 pstr->trans = trans;
178 pstr->icase = icase;
179 pstr->mbs_allocated = (trans != NULL || icase);
180 pstr->mb_cur_max = dfa->mb_cur_max;
181 pstr->is_utf8 = dfa->is_utf8;
182 pstr->map_notascii = dfa->map_notascii;
183 pstr->stop = pstr->len;
184 pstr->raw_stop = pstr->stop;
185 }
186
187 #ifdef RE_ENABLE_I18N
188
189 /* Build wide character buffer PSTR->WCS.
190 If the byte sequence of the string are:
191 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
192 Then wide character buffer will be:
193 <wc1> , WEOF , <wc2> , WEOF , <wc3>
194 We use WEOF for padding, they indicate that the position isn't
195 a first byte of a multibyte character.
196
197 Note that this function assumes PSTR->VALID_LEN elements are already
198 built and starts from PSTR->VALID_LEN. */
199
200 static void
201 internal_function
202 build_wcs_buffer (re_string_t *pstr)
203 {
204 #ifdef _LIBC
205 unsigned char buf[MB_LEN_MAX];
206 assert (MB_LEN_MAX >= pstr->mb_cur_max);
207 #else
208 unsigned char buf[64];
209 #endif
210 mbstate_t prev_st;
211 Idx byte_idx, end_idx, remain_len;
212 size_t mbclen;
213
214 /* Build the buffers from pstr->valid_len to either pstr->len or
215 pstr->bufs_len. */
216 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
217 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
218 {
219 wchar_t wc;
220 const char *p;
221
222 remain_len = end_idx - byte_idx;
223 prev_st = pstr->cur_state;
224 /* Apply the translation if we need. */
225 if (BE (pstr->trans != NULL, 0))
226 {
227 int i, ch;
228
229 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
230 {
231 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
232 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
233 }
234 p = (const char *) buf;
235 }
236 else
237 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
238 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
239 if (BE (mbclen == (size_t) -1 || mbclen == 0
240 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
241 {
242 /* We treat these cases as a singlebyte character. */
243 mbclen = 1;
244 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
245 if (BE (pstr->trans != NULL, 0))
246 wc = pstr->trans[wc];
247 pstr->cur_state = prev_st;
248 }
249 else if (BE (mbclen == (size_t) -2, 0))
250 {
251 /* The buffer doesn't have enough space, finish to build. */
252 pstr->cur_state = prev_st;
253 break;
254 }
255
256 /* Write wide character and padding. */
257 pstr->wcs[byte_idx++] = wc;
258 /* Write paddings. */
259 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
260 pstr->wcs[byte_idx++] = WEOF;
261 }
262 pstr->valid_len = byte_idx;
263 pstr->valid_raw_len = byte_idx;
264 }
265
266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
267 but for REG_ICASE. */
268
269 static reg_errcode_t
270 internal_function __attribute_warn_unused_result__
271 build_wcs_upper_buffer (re_string_t *pstr)
272 {
273 mbstate_t prev_st;
274 Idx src_idx, byte_idx, end_idx, remain_len;
275 size_t mbclen;
276 #ifdef _LIBC
277 char buf[MB_LEN_MAX];
278 assert (MB_LEN_MAX >= pstr->mb_cur_max);
279 #else
280 char buf[64];
281 #endif
282
283 byte_idx = pstr->valid_len;
284 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
285
286 /* The following optimization assumes that ASCII characters can be
287 mapped to wide characters with a simple cast. */
288 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
289 {
290 while (byte_idx < end_idx)
291 {
292 wchar_t wc;
293
294 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
295 && mbsinit (&pstr->cur_state))
296 {
297 /* In case of a singlebyte character. */
298 pstr->mbs[byte_idx]
299 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
300 /* The next step uses the assumption that wchar_t is encoded
301 ASCII-safe: all ASCII values can be converted like this. */
302 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
303 ++byte_idx;
304 continue;
305 }
306
307 remain_len = end_idx - byte_idx;
308 prev_st = pstr->cur_state;
309 mbclen = __mbrtowc (&wc,
310 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
311 + byte_idx), remain_len, &pstr->cur_state);
312 if (BE (mbclen < (size_t) -2, 1))
313 {
314 wchar_t wcu = wc;
315 if (iswlower (wc))
316 {
317 size_t mbcdlen;
318
319 wcu = towupper (wc);
320 mbcdlen = wcrtomb (buf, wcu, &prev_st);
321 if (BE (mbclen == mbcdlen, 1))
322 memcpy (pstr->mbs + byte_idx, buf, mbclen);
323 else
324 {
325 src_idx = byte_idx;
326 goto offsets_needed;
327 }
328 }
329 else
330 memcpy (pstr->mbs + byte_idx,
331 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
332 pstr->wcs[byte_idx++] = wcu;
333 /* Write paddings. */
334 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
335 pstr->wcs[byte_idx++] = WEOF;
336 }
337 else if (mbclen == (size_t) -1 || mbclen == 0
338 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
339 {
340 /* It is an invalid character, an incomplete character
341 at the end of the string, or '\0'. Just use the byte. */
342 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
343 pstr->mbs[byte_idx] = ch;
344 /* And also cast it to wide char. */
345 pstr->wcs[byte_idx++] = (wchar_t) ch;
346 if (BE (mbclen == (size_t) -1, 0))
347 pstr->cur_state = prev_st;
348 }
349 else
350 {
351 /* The buffer doesn't have enough space, finish to build. */
352 pstr->cur_state = prev_st;
353 break;
354 }
355 }
356 pstr->valid_len = byte_idx;
357 pstr->valid_raw_len = byte_idx;
358 return REG_NOERROR;
359 }
360 else
361 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
362 {
363 wchar_t wc;
364 const char *p;
365 offsets_needed:
366 remain_len = end_idx - byte_idx;
367 prev_st = pstr->cur_state;
368 if (BE (pstr->trans != NULL, 0))
369 {
370 int i, ch;
371
372 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
373 {
374 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
375 buf[i] = pstr->trans[ch];
376 }
377 p = (const char *) buf;
378 }
379 else
380 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
381 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
382 if (BE (mbclen < (size_t) -2, 1))
383 {
384 wchar_t wcu = wc;
385 if (iswlower (wc))
386 {
387 size_t mbcdlen;
388
389 wcu = towupper (wc);
390 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
391 if (BE (mbclen == mbcdlen, 1))
392 memcpy (pstr->mbs + byte_idx, buf, mbclen);
393 else if (mbcdlen != (size_t) -1)
394 {
395 size_t i;
396
397 if (byte_idx + mbcdlen > pstr->bufs_len)
398 {
399 pstr->cur_state = prev_st;
400 break;
401 }
402
403 if (pstr->offsets == NULL)
404 {
405 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
406
407 if (pstr->offsets == NULL)
408 return REG_ESPACE;
409 }
410 if (!pstr->offsets_needed)
411 {
412 for (i = 0; i < (size_t) byte_idx; ++i)
413 pstr->offsets[i] = i;
414 pstr->offsets_needed = 1;
415 }
416
417 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
418 pstr->wcs[byte_idx] = wcu;
419 pstr->offsets[byte_idx] = src_idx;
420 for (i = 1; i < mbcdlen; ++i)
421 {
422 pstr->offsets[byte_idx + i]
423 = src_idx + (i < mbclen ? i : mbclen - 1);
424 pstr->wcs[byte_idx + i] = WEOF;
425 }
426 pstr->len += mbcdlen - mbclen;
427 if (pstr->raw_stop > src_idx)
428 pstr->stop += mbcdlen - mbclen;
429 end_idx = (pstr->bufs_len > pstr->len)
430 ? pstr->len : pstr->bufs_len;
431 byte_idx += mbcdlen;
432 src_idx += mbclen;
433 continue;
434 }
435 else
436 memcpy (pstr->mbs + byte_idx, p, mbclen);
437 }
438 else
439 memcpy (pstr->mbs + byte_idx, p, mbclen);
440
441 if (BE (pstr->offsets_needed != 0, 0))
442 {
443 size_t i;
444 for (i = 0; i < mbclen; ++i)
445 pstr->offsets[byte_idx + i] = src_idx + i;
446 }
447 src_idx += mbclen;
448
449 pstr->wcs[byte_idx++] = wcu;
450 /* Write paddings. */
451 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
452 pstr->wcs[byte_idx++] = WEOF;
453 }
454 else if (mbclen == (size_t) -1 || mbclen == 0
455 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
456 {
457 /* It is an invalid character or '\0'. Just use the byte. */
458 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
459
460 if (BE (pstr->trans != NULL, 0))
461 ch = pstr->trans [ch];
462 pstr->mbs[byte_idx] = ch;
463
464 if (BE (pstr->offsets_needed != 0, 0))
465 pstr->offsets[byte_idx] = src_idx;
466 ++src_idx;
467
468 /* And also cast it to wide char. */
469 pstr->wcs[byte_idx++] = (wchar_t) ch;
470 if (BE (mbclen == (size_t) -1, 0))
471 pstr->cur_state = prev_st;
472 }
473 else
474 {
475 /* The buffer doesn't have enough space, finish to build. */
476 pstr->cur_state = prev_st;
477 break;
478 }
479 }
480 pstr->valid_len = byte_idx;
481 pstr->valid_raw_len = src_idx;
482 return REG_NOERROR;
483 }
484
485 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
486 Return the index. */
487
488 static Idx
489 internal_function
490 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
491 {
492 mbstate_t prev_st;
493 Idx rawbuf_idx;
494 size_t mbclen;
495 wint_t wc = WEOF;
496
497 /* Skip the characters which are not necessary to check. */
498 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
499 rawbuf_idx < new_raw_idx;)
500 {
501 wchar_t wc2;
502 Idx remain_len = pstr->raw_len - rawbuf_idx;
503 prev_st = pstr->cur_state;
504 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
505 remain_len, &pstr->cur_state);
506 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
507 {
508 /* We treat these cases as a single byte character. */
509 if (mbclen == 0 || remain_len == 0)
510 wc = L'\0';
511 else
512 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
513 mbclen = 1;
514 pstr->cur_state = prev_st;
515 }
516 else
517 wc = wc2;
518 /* Then proceed the next character. */
519 rawbuf_idx += mbclen;
520 }
521 *last_wc = wc;
522 return rawbuf_idx;
523 }
524 #endif /* RE_ENABLE_I18N */
525
526 /* Build the buffer PSTR->MBS, and apply the translation if we need.
527 This function is used in case of REG_ICASE. */
528
529 static void
530 internal_function
531 build_upper_buffer (re_string_t *pstr)
532 {
533 Idx char_idx, end_idx;
534 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
535
536 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
537 {
538 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
539 if (BE (pstr->trans != NULL, 0))
540 ch = pstr->trans[ch];
541 if (islower (ch))
542 pstr->mbs[char_idx] = toupper (ch);
543 else
544 pstr->mbs[char_idx] = ch;
545 }
546 pstr->valid_len = char_idx;
547 pstr->valid_raw_len = char_idx;
548 }
549
550 /* Apply TRANS to the buffer in PSTR. */
551
552 static void
553 internal_function
554 re_string_translate_buffer (re_string_t *pstr)
555 {
556 Idx buf_idx, end_idx;
557 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
558
559 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
560 {
561 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
562 pstr->mbs[buf_idx] = pstr->trans[ch];
563 }
564
565 pstr->valid_len = buf_idx;
566 pstr->valid_raw_len = buf_idx;
567 }
568
569 /* This function re-construct the buffers.
570 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
571 convert to upper case in case of REG_ICASE, apply translation. */
572
573 static reg_errcode_t
574 internal_function __attribute_warn_unused_result__
575 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
576 {
577 Idx offset;
578
579 if (BE (pstr->raw_mbs_idx <= idx, 0))
580 offset = idx - pstr->raw_mbs_idx;
581 else
582 {
583 /* Reset buffer. */
584 #ifdef RE_ENABLE_I18N
585 if (pstr->mb_cur_max > 1)
586 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
587 #endif /* RE_ENABLE_I18N */
588 pstr->len = pstr->raw_len;
589 pstr->stop = pstr->raw_stop;
590 pstr->valid_len = 0;
591 pstr->raw_mbs_idx = 0;
592 pstr->valid_raw_len = 0;
593 pstr->offsets_needed = 0;
594 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
595 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
596 if (!pstr->mbs_allocated)
597 pstr->mbs = (unsigned char *) pstr->raw_mbs;
598 offset = idx;
599 }
600
601 if (BE (offset != 0, 1))
602 {
603 /* Should the already checked characters be kept? */
604 if (BE (offset < pstr->valid_raw_len, 1))
605 {
606 /* Yes, move them to the front of the buffer. */
607 #ifdef RE_ENABLE_I18N
608 if (BE (pstr->offsets_needed, 0))
609 {
610 Idx low = 0, high = pstr->valid_len, mid;
611 do
612 {
613 mid = (high + low) / 2;
614 if (pstr->offsets[mid] > offset)
615 high = mid;
616 else if (pstr->offsets[mid] < offset)
617 low = mid + 1;
618 else
619 break;
620 }
621 while (low < high);
622 if (pstr->offsets[mid] < offset)
623 ++mid;
624 pstr->tip_context = re_string_context_at (pstr, mid - 1,
625 eflags);
626 /* This can be quite complicated, so handle specially
627 only the common and easy case where the character with
628 different length representation of lower and upper
629 case is present at or after offset. */
630 if (pstr->valid_len > offset
631 && mid == offset && pstr->offsets[mid] == offset)
632 {
633 memmove (pstr->wcs, pstr->wcs + offset,
634 (pstr->valid_len - offset) * sizeof (wint_t));
635 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
636 pstr->valid_len -= offset;
637 pstr->valid_raw_len -= offset;
638 for (low = 0; low < pstr->valid_len; low++)
639 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
640 }
641 else
642 {
643 /* Otherwise, just find out how long the partial multibyte
644 character at offset is and fill it with WEOF/255. */
645 pstr->len = pstr->raw_len - idx + offset;
646 pstr->stop = pstr->raw_stop - idx + offset;
647 pstr->offsets_needed = 0;
648 while (mid > 0 && pstr->offsets[mid - 1] == offset)
649 --mid;
650 while (mid < pstr->valid_len)
651 if (pstr->wcs[mid] != WEOF)
652 break;
653 else
654 ++mid;
655 if (mid == pstr->valid_len)
656 pstr->valid_len = 0;
657 else
658 {
659 pstr->valid_len = pstr->offsets[mid] - offset;
660 if (pstr->valid_len)
661 {
662 for (low = 0; low < pstr->valid_len; ++low)
663 pstr->wcs[low] = WEOF;
664 memset (pstr->mbs, 255, pstr->valid_len);
665 }
666 }
667 pstr->valid_raw_len = pstr->valid_len;
668 }
669 }
670 else
671 #endif
672 {
673 pstr->tip_context = re_string_context_at (pstr, offset - 1,
674 eflags);
675 #ifdef RE_ENABLE_I18N
676 if (pstr->mb_cur_max > 1)
677 memmove (pstr->wcs, pstr->wcs + offset,
678 (pstr->valid_len - offset) * sizeof (wint_t));
679 #endif /* RE_ENABLE_I18N */
680 if (BE (pstr->mbs_allocated, 0))
681 memmove (pstr->mbs, pstr->mbs + offset,
682 pstr->valid_len - offset);
683 pstr->valid_len -= offset;
684 pstr->valid_raw_len -= offset;
685 #if DEBUG
686 assert (pstr->valid_len > 0);
687 #endif
688 }
689 }
690 else
691 {
692 #ifdef RE_ENABLE_I18N
693 /* No, skip all characters until IDX. */
694 Idx prev_valid_len = pstr->valid_len;
695
696 if (BE (pstr->offsets_needed, 0))
697 {
698 pstr->len = pstr->raw_len - idx + offset;
699 pstr->stop = pstr->raw_stop - idx + offset;
700 pstr->offsets_needed = 0;
701 }
702 #endif
703 pstr->valid_len = 0;
704 #ifdef RE_ENABLE_I18N
705 if (pstr->mb_cur_max > 1)
706 {
707 Idx wcs_idx;
708 wint_t wc = WEOF;
709
710 if (pstr->is_utf8)
711 {
712 const unsigned char *raw, *p, *end;
713
714 /* Special case UTF-8. Multi-byte chars start with any
715 byte other than 0x80 - 0xbf. */
716 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
717 end = raw + (offset - pstr->mb_cur_max);
718 if (end < pstr->raw_mbs)
719 end = pstr->raw_mbs;
720 p = raw + offset - 1;
721 #ifdef _LIBC
722 /* We know the wchar_t encoding is UCS4, so for the simple
723 case, ASCII characters, skip the conversion step. */
724 if (isascii (*p) && BE (pstr->trans == NULL, 1))
725 {
726 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
727 /* pstr->valid_len = 0; */
728 wc = (wchar_t) *p;
729 }
730 else
731 #endif
732 for (; p >= end; --p)
733 if ((*p & 0xc0) != 0x80)
734 {
735 mbstate_t cur_state;
736 wchar_t wc2;
737 Idx mlen = raw + pstr->len - p;
738 unsigned char buf[6];
739 size_t mbclen;
740
741 const unsigned char *pp = p;
742 if (BE (pstr->trans != NULL, 0))
743 {
744 int i = mlen < 6 ? mlen : 6;
745 while (--i >= 0)
746 buf[i] = pstr->trans[p[i]];
747 pp = buf;
748 }
749 /* XXX Don't use mbrtowc, we know which conversion
750 to use (UTF-8 -> UCS4). */
751 memset (&cur_state, 0, sizeof (cur_state));
752 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
753 &cur_state);
754 if (raw + offset - p <= mbclen
755 && mbclen < (size_t) -2)
756 {
757 memset (&pstr->cur_state, '\0',
758 sizeof (mbstate_t));
759 pstr->valid_len = mbclen - (raw + offset - p);
760 wc = wc2;
761 }
762 break;
763 }
764 }
765
766 if (wc == WEOF)
767 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
768 if (wc == WEOF)
769 pstr->tip_context
770 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
771 else
772 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
773 && IS_WIDE_WORD_CHAR (wc))
774 ? CONTEXT_WORD
775 : ((IS_WIDE_NEWLINE (wc)
776 && pstr->newline_anchor)
777 ? CONTEXT_NEWLINE : 0));
778 if (BE (pstr->valid_len, 0))
779 {
780 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
781 pstr->wcs[wcs_idx] = WEOF;
782 if (pstr->mbs_allocated)
783 memset (pstr->mbs, 255, pstr->valid_len);
784 }
785 pstr->valid_raw_len = pstr->valid_len;
786 }
787 else
788 #endif /* RE_ENABLE_I18N */
789 {
790 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
791 pstr->valid_raw_len = 0;
792 if (pstr->trans)
793 c = pstr->trans[c];
794 pstr->tip_context = (bitset_contain (pstr->word_char, c)
795 ? CONTEXT_WORD
796 : ((IS_NEWLINE (c) && pstr->newline_anchor)
797 ? CONTEXT_NEWLINE : 0));
798 }
799 }
800 if (!BE (pstr->mbs_allocated, 0))
801 pstr->mbs += offset;
802 }
803 pstr->raw_mbs_idx = idx;
804 pstr->len -= offset;
805 pstr->stop -= offset;
806
807 /* Then build the buffers. */
808 #ifdef RE_ENABLE_I18N
809 if (pstr->mb_cur_max > 1)
810 {
811 if (pstr->icase)
812 {
813 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
814 if (BE (ret != REG_NOERROR, 0))
815 return ret;
816 }
817 else
818 build_wcs_buffer (pstr);
819 }
820 else
821 #endif /* RE_ENABLE_I18N */
822 if (BE (pstr->mbs_allocated, 0))
823 {
824 if (pstr->icase)
825 build_upper_buffer (pstr);
826 else if (pstr->trans != NULL)
827 re_string_translate_buffer (pstr);
828 }
829 else
830 pstr->valid_len = pstr->len;
831
832 pstr->cur_idx = 0;
833 return REG_NOERROR;
834 }
835
836 static unsigned char
837 internal_function __attribute__ ((pure))
838 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
839 {
840 int ch;
841 Idx off;
842
843 /* Handle the common (easiest) cases first. */
844 if (BE (!pstr->mbs_allocated, 1))
845 return re_string_peek_byte (pstr, idx);
846
847 #ifdef RE_ENABLE_I18N
848 if (pstr->mb_cur_max > 1
849 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
850 return re_string_peek_byte (pstr, idx);
851 #endif
852
853 off = pstr->cur_idx + idx;
854 #ifdef RE_ENABLE_I18N
855 if (pstr->offsets_needed)
856 off = pstr->offsets[off];
857 #endif
858
859 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
860
861 #ifdef RE_ENABLE_I18N
862 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
863 this function returns CAPITAL LETTER I instead of first byte of
864 DOTLESS SMALL LETTER I. The latter would confuse the parser,
865 since peek_byte_case doesn't advance cur_idx in any way. */
866 if (pstr->offsets_needed && !isascii (ch))
867 return re_string_peek_byte (pstr, idx);
868 #endif
869
870 return ch;
871 }
872
873 static unsigned char
874 internal_function
875 re_string_fetch_byte_case (re_string_t *pstr)
876 {
877 if (BE (!pstr->mbs_allocated, 1))
878 return re_string_fetch_byte (pstr);
879
880 #ifdef RE_ENABLE_I18N
881 if (pstr->offsets_needed)
882 {
883 Idx off;
884 int ch;
885
886 /* For tr_TR.UTF-8 [[:islower:]] there is
887 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
888 in that case the whole multi-byte character and return
889 the original letter. On the other side, with
890 [[: DOTLESS SMALL LETTER I return [[:I, as doing
891 anything else would complicate things too much. */
892
893 if (!re_string_first_byte (pstr, pstr->cur_idx))
894 return re_string_fetch_byte (pstr);
895
896 off = pstr->offsets[pstr->cur_idx];
897 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
898
899 if (! isascii (ch))
900 return re_string_fetch_byte (pstr);
901
902 re_string_skip_bytes (pstr,
903 re_string_char_size_at (pstr, pstr->cur_idx));
904 return ch;
905 }
906 #endif
907
908 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
909 }
910
911 static void
912 internal_function
913 re_string_destruct (re_string_t *pstr)
914 {
915 #ifdef RE_ENABLE_I18N
916 re_free (pstr->wcs);
917 re_free (pstr->offsets);
918 #endif /* RE_ENABLE_I18N */
919 if (pstr->mbs_allocated)
920 re_free (pstr->mbs);
921 }
922
923 /* Return the context at IDX in INPUT. */
924
925 static unsigned int
926 internal_function
927 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
928 {
929 int c;
930 if (BE (! REG_VALID_INDEX (idx), 0))
931 /* In this case, we use the value stored in input->tip_context,
932 since we can't know the character in input->mbs[-1] here. */
933 return input->tip_context;
934 if (BE (idx == input->len, 0))
935 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
936 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
937 #ifdef RE_ENABLE_I18N
938 if (input->mb_cur_max > 1)
939 {
940 wint_t wc;
941 Idx wc_idx = idx;
942 while(input->wcs[wc_idx] == WEOF)
943 {
944 #ifdef DEBUG
945 /* It must not happen. */
946 assert (REG_VALID_INDEX (wc_idx));
947 #endif
948 --wc_idx;
949 if (! REG_VALID_INDEX (wc_idx))
950 return input->tip_context;
951 }
952 wc = input->wcs[wc_idx];
953 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
954 return CONTEXT_WORD;
955 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
956 ? CONTEXT_NEWLINE : 0);
957 }
958 else
959 #endif
960 {
961 c = re_string_byte_at (input, idx);
962 if (bitset_contain (input->word_char, c))
963 return CONTEXT_WORD;
964 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
965 }
966 }
967 \f
968 /* Functions for set operation. */
969
970 static reg_errcode_t
971 internal_function __attribute_warn_unused_result__
972 re_node_set_alloc (re_node_set *set, Idx size)
973 {
974 set->alloc = size;
975 set->nelem = 0;
976 set->elems = re_malloc (Idx, size);
977 if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0))
978 return REG_ESPACE;
979 return REG_NOERROR;
980 }
981
982 static reg_errcode_t
983 internal_function __attribute_warn_unused_result__
984 re_node_set_init_1 (re_node_set *set, Idx elem)
985 {
986 set->alloc = 1;
987 set->nelem = 1;
988 set->elems = re_malloc (Idx, 1);
989 if (BE (set->elems == NULL, 0))
990 {
991 set->alloc = set->nelem = 0;
992 return REG_ESPACE;
993 }
994 set->elems[0] = elem;
995 return REG_NOERROR;
996 }
997
998 static reg_errcode_t
999 internal_function __attribute_warn_unused_result__
1000 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1001 {
1002 set->alloc = 2;
1003 set->elems = re_malloc (Idx, 2);
1004 if (BE (set->elems == NULL, 0))
1005 return REG_ESPACE;
1006 if (elem1 == elem2)
1007 {
1008 set->nelem = 1;
1009 set->elems[0] = elem1;
1010 }
1011 else
1012 {
1013 set->nelem = 2;
1014 if (elem1 < elem2)
1015 {
1016 set->elems[0] = elem1;
1017 set->elems[1] = elem2;
1018 }
1019 else
1020 {
1021 set->elems[0] = elem2;
1022 set->elems[1] = elem1;
1023 }
1024 }
1025 return REG_NOERROR;
1026 }
1027
1028 static reg_errcode_t
1029 internal_function __attribute_warn_unused_result__
1030 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1031 {
1032 dest->nelem = src->nelem;
1033 if (src->nelem > 0)
1034 {
1035 dest->alloc = dest->nelem;
1036 dest->elems = re_malloc (Idx, dest->alloc);
1037 if (BE (dest->elems == NULL, 0))
1038 {
1039 dest->alloc = dest->nelem = 0;
1040 return REG_ESPACE;
1041 }
1042 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1043 }
1044 else
1045 re_node_set_init_empty (dest);
1046 return REG_NOERROR;
1047 }
1048
1049 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1050 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1051 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1052
1053 static reg_errcode_t
1054 internal_function __attribute_warn_unused_result__
1055 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1056 const re_node_set *src2)
1057 {
1058 Idx i1, i2, is, id, delta, sbase;
1059 if (src1->nelem == 0 || src2->nelem == 0)
1060 return REG_NOERROR;
1061
1062 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1063 conservative estimate. */
1064 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1065 {
1066 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1067 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1068 if (BE (new_elems == NULL, 0))
1069 return REG_ESPACE;
1070 dest->elems = new_elems;
1071 dest->alloc = new_alloc;
1072 }
1073
1074 /* Find the items in the intersection of SRC1 and SRC2, and copy
1075 into the top of DEST those that are not already in DEST itself. */
1076 sbase = dest->nelem + src1->nelem + src2->nelem;
1077 i1 = src1->nelem - 1;
1078 i2 = src2->nelem - 1;
1079 id = dest->nelem - 1;
1080 for (;;)
1081 {
1082 if (src1->elems[i1] == src2->elems[i2])
1083 {
1084 /* Try to find the item in DEST. Maybe we could binary search? */
1085 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1086 --id;
1087
1088 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1089 dest->elems[--sbase] = src1->elems[i1];
1090
1091 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1092 break;
1093 }
1094
1095 /* Lower the highest of the two items. */
1096 else if (src1->elems[i1] < src2->elems[i2])
1097 {
1098 if (! REG_VALID_INDEX (--i2))
1099 break;
1100 }
1101 else
1102 {
1103 if (! REG_VALID_INDEX (--i1))
1104 break;
1105 }
1106 }
1107
1108 id = dest->nelem - 1;
1109 is = dest->nelem + src1->nelem + src2->nelem - 1;
1110 delta = is - sbase + 1;
1111
1112 /* Now copy. When DELTA becomes zero, the remaining
1113 DEST elements are already in place; this is more or
1114 less the same loop that is in re_node_set_merge. */
1115 dest->nelem += delta;
1116 if (delta > 0 && REG_VALID_INDEX (id))
1117 for (;;)
1118 {
1119 if (dest->elems[is] > dest->elems[id])
1120 {
1121 /* Copy from the top. */
1122 dest->elems[id + delta--] = dest->elems[is--];
1123 if (delta == 0)
1124 break;
1125 }
1126 else
1127 {
1128 /* Slide from the bottom. */
1129 dest->elems[id + delta] = dest->elems[id];
1130 if (! REG_VALID_INDEX (--id))
1131 break;
1132 }
1133 }
1134
1135 /* Copy remaining SRC elements. */
1136 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1137
1138 return REG_NOERROR;
1139 }
1140
1141 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1142 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1143
1144 static reg_errcode_t
1145 internal_function __attribute_warn_unused_result__
1146 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1147 const re_node_set *src2)
1148 {
1149 Idx i1, i2, id;
1150 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1151 {
1152 dest->alloc = src1->nelem + src2->nelem;
1153 dest->elems = re_malloc (Idx, dest->alloc);
1154 if (BE (dest->elems == NULL, 0))
1155 return REG_ESPACE;
1156 }
1157 else
1158 {
1159 if (src1 != NULL && src1->nelem > 0)
1160 return re_node_set_init_copy (dest, src1);
1161 else if (src2 != NULL && src2->nelem > 0)
1162 return re_node_set_init_copy (dest, src2);
1163 else
1164 re_node_set_init_empty (dest);
1165 return REG_NOERROR;
1166 }
1167 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1168 {
1169 if (src1->elems[i1] > src2->elems[i2])
1170 {
1171 dest->elems[id++] = src2->elems[i2++];
1172 continue;
1173 }
1174 if (src1->elems[i1] == src2->elems[i2])
1175 ++i2;
1176 dest->elems[id++] = src1->elems[i1++];
1177 }
1178 if (i1 < src1->nelem)
1179 {
1180 memcpy (dest->elems + id, src1->elems + i1,
1181 (src1->nelem - i1) * sizeof (Idx));
1182 id += src1->nelem - i1;
1183 }
1184 else if (i2 < src2->nelem)
1185 {
1186 memcpy (dest->elems + id, src2->elems + i2,
1187 (src2->nelem - i2) * sizeof (Idx));
1188 id += src2->nelem - i2;
1189 }
1190 dest->nelem = id;
1191 return REG_NOERROR;
1192 }
1193
1194 /* Calculate the union set of the sets DEST and SRC. And store it to
1195 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1196
1197 static reg_errcode_t
1198 internal_function __attribute_warn_unused_result__
1199 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1200 {
1201 Idx is, id, sbase, delta;
1202 if (src == NULL || src->nelem == 0)
1203 return REG_NOERROR;
1204 if (dest->alloc < 2 * src->nelem + dest->nelem)
1205 {
1206 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1207 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1208 if (BE (new_buffer == NULL, 0))
1209 return REG_ESPACE;
1210 dest->elems = new_buffer;
1211 dest->alloc = new_alloc;
1212 }
1213
1214 if (BE (dest->nelem == 0, 0))
1215 {
1216 dest->nelem = src->nelem;
1217 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1218 return REG_NOERROR;
1219 }
1220
1221 /* Copy into the top of DEST the items of SRC that are not
1222 found in DEST. Maybe we could binary search in DEST? */
1223 for (sbase = dest->nelem + 2 * src->nelem,
1224 is = src->nelem - 1, id = dest->nelem - 1;
1225 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1226 {
1227 if (dest->elems[id] == src->elems[is])
1228 is--, id--;
1229 else if (dest->elems[id] < src->elems[is])
1230 dest->elems[--sbase] = src->elems[is--];
1231 else /* if (dest->elems[id] > src->elems[is]) */
1232 --id;
1233 }
1234
1235 if (REG_VALID_INDEX (is))
1236 {
1237 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1238 sbase -= is + 1;
1239 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1240 }
1241
1242 id = dest->nelem - 1;
1243 is = dest->nelem + 2 * src->nelem - 1;
1244 delta = is - sbase + 1;
1245 if (delta == 0)
1246 return REG_NOERROR;
1247
1248 /* Now copy. When DELTA becomes zero, the remaining
1249 DEST elements are already in place. */
1250 dest->nelem += delta;
1251 for (;;)
1252 {
1253 if (dest->elems[is] > dest->elems[id])
1254 {
1255 /* Copy from the top. */
1256 dest->elems[id + delta--] = dest->elems[is--];
1257 if (delta == 0)
1258 break;
1259 }
1260 else
1261 {
1262 /* Slide from the bottom. */
1263 dest->elems[id + delta] = dest->elems[id];
1264 if (! REG_VALID_INDEX (--id))
1265 {
1266 /* Copy remaining SRC elements. */
1267 memcpy (dest->elems, dest->elems + sbase,
1268 delta * sizeof (Idx));
1269 break;
1270 }
1271 }
1272 }
1273
1274 return REG_NOERROR;
1275 }
1276
1277 /* Insert the new element ELEM to the re_node_set* SET.
1278 SET should not already have ELEM.
1279 Return true if successful. */
1280
1281 static bool
1282 internal_function __attribute_warn_unused_result__
1283 re_node_set_insert (re_node_set *set, Idx elem)
1284 {
1285 Idx idx;
1286 /* In case the set is empty. */
1287 if (set->alloc == 0)
1288 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1289
1290 if (BE (set->nelem, 0) == 0)
1291 {
1292 /* We already guaranteed above that set->alloc != 0. */
1293 set->elems[0] = elem;
1294 ++set->nelem;
1295 return true;
1296 }
1297
1298 /* Realloc if we need. */
1299 if (set->alloc == set->nelem)
1300 {
1301 Idx *new_elems;
1302 set->alloc = set->alloc * 2;
1303 new_elems = re_realloc (set->elems, Idx, set->alloc);
1304 if (BE (new_elems == NULL, 0))
1305 return false;
1306 set->elems = new_elems;
1307 }
1308
1309 /* Move the elements which follows the new element. Test the
1310 first element separately to skip a check in the inner loop. */
1311 if (elem < set->elems[0])
1312 {
1313 idx = 0;
1314 for (idx = set->nelem; idx > 0; idx--)
1315 set->elems[idx] = set->elems[idx - 1];
1316 }
1317 else
1318 {
1319 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1320 set->elems[idx] = set->elems[idx - 1];
1321 }
1322
1323 /* Insert the new element. */
1324 set->elems[idx] = elem;
1325 ++set->nelem;
1326 return true;
1327 }
1328
1329 /* Insert the new element ELEM to the re_node_set* SET.
1330 SET should not already have any element greater than or equal to ELEM.
1331 Return true if successful. */
1332
1333 static bool
1334 internal_function __attribute_warn_unused_result__
1335 re_node_set_insert_last (re_node_set *set, Idx elem)
1336 {
1337 /* Realloc if we need. */
1338 if (set->alloc == set->nelem)
1339 {
1340 Idx *new_elems;
1341 set->alloc = (set->alloc + 1) * 2;
1342 new_elems = re_realloc (set->elems, Idx, set->alloc);
1343 if (BE (new_elems == NULL, 0))
1344 return false;
1345 set->elems = new_elems;
1346 }
1347
1348 /* Insert the new element. */
1349 set->elems[set->nelem++] = elem;
1350 return true;
1351 }
1352
1353 /* Compare two node sets SET1 and SET2.
1354 Return true if SET1 and SET2 are equivalent. */
1355
1356 static bool
1357 internal_function __attribute__ ((pure))
1358 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1359 {
1360 Idx i;
1361 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1362 return false;
1363 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1364 if (set1->elems[i] != set2->elems[i])
1365 return false;
1366 return true;
1367 }
1368
1369 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1370
1371 static Idx
1372 internal_function __attribute__ ((pure))
1373 re_node_set_contains (const re_node_set *set, Idx elem)
1374 {
1375 __re_size_t idx, right, mid;
1376 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1377 return 0;
1378
1379 /* Binary search the element. */
1380 idx = 0;
1381 right = set->nelem - 1;
1382 while (idx < right)
1383 {
1384 mid = (idx + right) / 2;
1385 if (set->elems[mid] < elem)
1386 idx = mid + 1;
1387 else
1388 right = mid;
1389 }
1390 return set->elems[idx] == elem ? idx + 1 : 0;
1391 }
1392
1393 static void
1394 internal_function
1395 re_node_set_remove_at (re_node_set *set, Idx idx)
1396 {
1397 if (idx < 0 || idx >= set->nelem)
1398 return;
1399 --set->nelem;
1400 for (; idx < set->nelem; idx++)
1401 set->elems[idx] = set->elems[idx + 1];
1402 }
1403 \f
1404
1405 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1406 Or return REG_MISSING if an error occurred. */
1407
1408 static Idx
1409 internal_function
1410 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1411 {
1412 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1413 {
1414 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1415 Idx *new_nexts, *new_indices;
1416 re_node_set *new_edests, *new_eclosures;
1417 re_token_t *new_nodes;
1418
1419 /* Avoid overflows in realloc. */
1420 const size_t max_object_size = MAX (sizeof (re_token_t),
1421 MAX (sizeof (re_node_set),
1422 sizeof (Idx)));
1423 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1424 return REG_MISSING;
1425
1426 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1427 if (BE (new_nodes == NULL, 0))
1428 return REG_MISSING;
1429 dfa->nodes = new_nodes;
1430 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1431 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1432 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1433 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1434 if (BE (new_nexts == NULL || new_indices == NULL
1435 || new_edests == NULL || new_eclosures == NULL, 0))
1436 return REG_MISSING;
1437 dfa->nexts = new_nexts;
1438 dfa->org_indices = new_indices;
1439 dfa->edests = new_edests;
1440 dfa->eclosures = new_eclosures;
1441 dfa->nodes_alloc = new_nodes_alloc;
1442 }
1443 dfa->nodes[dfa->nodes_len] = token;
1444 dfa->nodes[dfa->nodes_len].constraint = 0;
1445 #ifdef RE_ENABLE_I18N
1446 dfa->nodes[dfa->nodes_len].accept_mb =
1447 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1448 || token.type == COMPLEX_BRACKET);
1449 #endif
1450 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1451 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1453 return dfa->nodes_len++;
1454 }
1455
1456 static re_hashval_t
1457 internal_function
1458 calc_state_hash (const re_node_set *nodes, unsigned int context)
1459 {
1460 re_hashval_t hash = nodes->nelem + context;
1461 Idx i;
1462 for (i = 0 ; i < nodes->nelem ; i++)
1463 hash += nodes->elems[i];
1464 return hash;
1465 }
1466
1467 /* Search for the state whose node_set is equivalent to NODES.
1468 Return the pointer to the state, if we found it in the DFA.
1469 Otherwise create the new one and return it. In case of an error
1470 return NULL and set the error code in ERR.
1471 Note: - We assume NULL as the invalid state, then it is possible that
1472 return value is NULL and ERR is REG_NOERROR.
1473 - We never return non-NULL value in case of any errors, it is for
1474 optimization. */
1475
1476 static re_dfastate_t *
1477 internal_function __attribute_warn_unused_result__
1478 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1479 const re_node_set *nodes)
1480 {
1481 re_hashval_t hash;
1482 re_dfastate_t *new_state;
1483 struct re_state_table_entry *spot;
1484 Idx i;
1485 #ifdef lint
1486 /* Suppress bogus uninitialized-variable warnings. */
1487 *err = REG_NOERROR;
1488 #endif
1489 if (BE (nodes->nelem == 0, 0))
1490 {
1491 *err = REG_NOERROR;
1492 return NULL;
1493 }
1494 hash = calc_state_hash (nodes, 0);
1495 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1496
1497 for (i = 0 ; i < spot->num ; i++)
1498 {
1499 re_dfastate_t *state = spot->array[i];
1500 if (hash != state->hash)
1501 continue;
1502 if (re_node_set_compare (&state->nodes, nodes))
1503 return state;
1504 }
1505
1506 /* There are no appropriate state in the dfa, create the new one. */
1507 new_state = create_ci_newstate (dfa, nodes, hash);
1508 if (BE (new_state == NULL, 0))
1509 *err = REG_ESPACE;
1510
1511 return new_state;
1512 }
1513
1514 /* Search for the state whose node_set is equivalent to NODES and
1515 whose context is equivalent to CONTEXT.
1516 Return the pointer to the state, if we found it in the DFA.
1517 Otherwise create the new one and return it. In case of an error
1518 return NULL and set the error code in ERR.
1519 Note: - We assume NULL as the invalid state, then it is possible that
1520 return value is NULL and ERR is REG_NOERROR.
1521 - We never return non-NULL value in case of any errors, it is for
1522 optimization. */
1523
1524 static re_dfastate_t *
1525 internal_function __attribute_warn_unused_result__
1526 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1527 const re_node_set *nodes, unsigned int context)
1528 {
1529 re_hashval_t hash;
1530 re_dfastate_t *new_state;
1531 struct re_state_table_entry *spot;
1532 Idx i;
1533 #ifdef lint
1534 /* Suppress bogus uninitialized-variable warnings. */
1535 *err = REG_NOERROR;
1536 #endif
1537 if (nodes->nelem == 0)
1538 {
1539 *err = REG_NOERROR;
1540 return NULL;
1541 }
1542 hash = calc_state_hash (nodes, context);
1543 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1544
1545 for (i = 0 ; i < spot->num ; i++)
1546 {
1547 re_dfastate_t *state = spot->array[i];
1548 if (state->hash == hash
1549 && state->context == context
1550 && re_node_set_compare (state->entrance_nodes, nodes))
1551 return state;
1552 }
1553 /* There are no appropriate state in 'dfa', create the new one. */
1554 new_state = create_cd_newstate (dfa, nodes, context, hash);
1555 if (BE (new_state == NULL, 0))
1556 *err = REG_ESPACE;
1557
1558 return new_state;
1559 }
1560
1561 /* Finish initialization of the new state NEWSTATE, and using its hash value
1562 HASH put in the appropriate bucket of DFA's state table. Return value
1563 indicates the error code if failed. */
1564
1565 static reg_errcode_t
1566 __attribute_warn_unused_result__
1567 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1568 re_hashval_t hash)
1569 {
1570 struct re_state_table_entry *spot;
1571 reg_errcode_t err;
1572 Idx i;
1573
1574 newstate->hash = hash;
1575 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1576 if (BE (err != REG_NOERROR, 0))
1577 return REG_ESPACE;
1578 for (i = 0; i < newstate->nodes.nelem; i++)
1579 {
1580 Idx elem = newstate->nodes.elems[i];
1581 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1582 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1583 return REG_ESPACE;
1584 }
1585
1586 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1587 if (BE (spot->alloc <= spot->num, 0))
1588 {
1589 Idx new_alloc = 2 * spot->num + 2;
1590 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1591 new_alloc);
1592 if (BE (new_array == NULL, 0))
1593 return REG_ESPACE;
1594 spot->array = new_array;
1595 spot->alloc = new_alloc;
1596 }
1597 spot->array[spot->num++] = newstate;
1598 return REG_NOERROR;
1599 }
1600
1601 static void
1602 free_state (re_dfastate_t *state)
1603 {
1604 re_node_set_free (&state->non_eps_nodes);
1605 re_node_set_free (&state->inveclosure);
1606 if (state->entrance_nodes != &state->nodes)
1607 {
1608 re_node_set_free (state->entrance_nodes);
1609 re_free (state->entrance_nodes);
1610 }
1611 re_node_set_free (&state->nodes);
1612 re_free (state->word_trtable);
1613 re_free (state->trtable);
1614 re_free (state);
1615 }
1616
1617 /* Create the new state which is independent of contexts.
1618 Return the new state if succeeded, otherwise return NULL. */
1619
1620 static re_dfastate_t *
1621 internal_function __attribute_warn_unused_result__
1622 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1623 re_hashval_t hash)
1624 {
1625 Idx i;
1626 reg_errcode_t err;
1627 re_dfastate_t *newstate;
1628
1629 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1630 if (BE (newstate == NULL, 0))
1631 return NULL;
1632 err = re_node_set_init_copy (&newstate->nodes, nodes);
1633 if (BE (err != REG_NOERROR, 0))
1634 {
1635 re_free (newstate);
1636 return NULL;
1637 }
1638
1639 newstate->entrance_nodes = &newstate->nodes;
1640 for (i = 0 ; i < nodes->nelem ; i++)
1641 {
1642 re_token_t *node = dfa->nodes + nodes->elems[i];
1643 re_token_type_t type = node->type;
1644 if (type == CHARACTER && !node->constraint)
1645 continue;
1646 #ifdef RE_ENABLE_I18N
1647 newstate->accept_mb |= node->accept_mb;
1648 #endif /* RE_ENABLE_I18N */
1649
1650 /* If the state has the halt node, the state is a halt state. */
1651 if (type == END_OF_RE)
1652 newstate->halt = 1;
1653 else if (type == OP_BACK_REF)
1654 newstate->has_backref = 1;
1655 else if (type == ANCHOR || node->constraint)
1656 newstate->has_constraint = 1;
1657 }
1658 err = register_state (dfa, newstate, hash);
1659 if (BE (err != REG_NOERROR, 0))
1660 {
1661 free_state (newstate);
1662 newstate = NULL;
1663 }
1664 return newstate;
1665 }
1666
1667 /* Create the new state which is depend on the context CONTEXT.
1668 Return the new state if succeeded, otherwise return NULL. */
1669
1670 static re_dfastate_t *
1671 internal_function __attribute_warn_unused_result__
1672 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1673 unsigned int context, re_hashval_t hash)
1674 {
1675 Idx i, nctx_nodes = 0;
1676 reg_errcode_t err;
1677 re_dfastate_t *newstate;
1678
1679 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1680 if (BE (newstate == NULL, 0))
1681 return NULL;
1682 err = re_node_set_init_copy (&newstate->nodes, nodes);
1683 if (BE (err != REG_NOERROR, 0))
1684 {
1685 re_free (newstate);
1686 return NULL;
1687 }
1688
1689 newstate->context = context;
1690 newstate->entrance_nodes = &newstate->nodes;
1691
1692 for (i = 0 ; i < nodes->nelem ; i++)
1693 {
1694 re_token_t *node = dfa->nodes + nodes->elems[i];
1695 re_token_type_t type = node->type;
1696 unsigned int constraint = node->constraint;
1697
1698 if (type == CHARACTER && !constraint)
1699 continue;
1700 #ifdef RE_ENABLE_I18N
1701 newstate->accept_mb |= node->accept_mb;
1702 #endif /* RE_ENABLE_I18N */
1703
1704 /* If the state has the halt node, the state is a halt state. */
1705 if (type == END_OF_RE)
1706 newstate->halt = 1;
1707 else if (type == OP_BACK_REF)
1708 newstate->has_backref = 1;
1709
1710 if (constraint)
1711 {
1712 if (newstate->entrance_nodes == &newstate->nodes)
1713 {
1714 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1715 if (BE (newstate->entrance_nodes == NULL, 0))
1716 {
1717 free_state (newstate);
1718 return NULL;
1719 }
1720 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1721 != REG_NOERROR)
1722 return NULL;
1723 nctx_nodes = 0;
1724 newstate->has_constraint = 1;
1725 }
1726
1727 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1728 {
1729 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1730 ++nctx_nodes;
1731 }
1732 }
1733 }
1734 err = register_state (dfa, newstate, hash);
1735 if (BE (err != REG_NOERROR, 0))
1736 {
1737 free_state (newstate);
1738 newstate = NULL;
1739 }
1740 return newstate;
1741 }