lib/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2014 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static void re_string_construct_common (const char *str, Idx len,
  21                                         re_string_t *pstr,
  22                                         RE_TRANSLATE_TYPE trans, bool icase,
  23                                         const re_dfa_t *dfa) internal_function;
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           re_hashval_t hash) internal_function;
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           re_hashval_t hash) internal_function;
  31 \f
  32 /* Functions for string operation.  */
  33
  34 /* This function allocate the buffers.  It is necessary to call
  35    re_string_reconstruct before using the object.  */
  36
  37 static reg_errcode_t
  38 internal_function __attribute_warn_unused_result__
  39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  40                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  41 {
  42   reg_errcode_t ret;
  43   Idx init_buf_len;
  44
  45   /* Ensure at least one character fits into the buffers.  */
  46   if (init_len < dfa->mb_cur_max)
  47     init_len = dfa->mb_cur_max;
  48   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  49   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  50
  51   ret = re_string_realloc_buffers (pstr, init_buf_len);
  52   if (BE (ret != REG_NOERROR, 0))
  53     return ret;
  54
  55   pstr->word_char = dfa->word_char;
  56   pstr->word_ops_used = dfa->word_ops_used;
  57   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  58   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  59   pstr->valid_raw_len = pstr->valid_len;
  60   return REG_NOERROR;
  61 }
  62
  63 /* This function allocate the buffers, and initialize them.  */
  64
  65 static reg_errcode_t
  66 internal_function __attribute_warn_unused_result__
  67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  68                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  69 {
  70   reg_errcode_t ret;
  71   memset (pstr, '\0', sizeof (re_string_t));
  72   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  73
  74   if (len > 0)
  75     {
  76       ret = re_string_realloc_buffers (pstr, len + 1);
  77       if (BE (ret != REG_NOERROR, 0))
  78         return ret;
  79     }
  80   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  81
  82   if (icase)
  83     {
  84 #ifdef RE_ENABLE_I18N
  85       if (dfa->mb_cur_max > 1)
  86         {
  87           while (1)
  88             {
  89               ret = build_wcs_upper_buffer (pstr);
  90               if (BE (ret != REG_NOERROR, 0))
  91                 return ret;
  92               if (pstr->valid_raw_len >= len)
  93                 break;
  94               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  95                 break;
  96               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  97               if (BE (ret != REG_NOERROR, 0))
  98                 return ret;
  99             }
 100         }
 101       else
 102 #endif /* RE_ENABLE_I18N  */
 103         build_upper_buffer (pstr);
 104     }
 105   else
 106     {
 107 #ifdef RE_ENABLE_I18N
 108       if (dfa->mb_cur_max > 1)
 109         build_wcs_buffer (pstr);
 110       else
 111 #endif /* RE_ENABLE_I18N  */
 112         {
 113           if (trans != NULL)
 114             re_string_translate_buffer (pstr);
 115           else
 116             {
 117               pstr->valid_len = pstr->bufs_len;
 118               pstr->valid_raw_len = pstr->bufs_len;
 119             }
 120         }
 121     }
 122
 123   return REG_NOERROR;
 124 }
 125
 126 /* Helper functions for re_string_allocate, and re_string_construct.  */
 127
 128 static reg_errcode_t
 129 internal_function __attribute_warn_unused_result__
 130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 131 {
 132 #ifdef RE_ENABLE_I18N
 133   if (pstr->mb_cur_max > 1)
 134     {
 135       wint_t *new_wcs;
 136
 137       /* Avoid overflow in realloc.  */
 138       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 139       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
 140         return REG_ESPACE;
 141
 142       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 143       if (BE (new_wcs == NULL, 0))
 144         return REG_ESPACE;
 145       pstr->wcs = new_wcs;
 146       if (pstr->offsets != NULL)
 147         {
 148           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 149           if (BE (new_offsets == NULL, 0))
 150             return REG_ESPACE;
 151           pstr->offsets = new_offsets;
 152         }
 153     }
 154 #endif /* RE_ENABLE_I18N  */
 155   if (pstr->mbs_allocated)
 156     {
 157       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 158                                            new_buf_len);
 159       if (BE (new_mbs == NULL, 0))
 160         return REG_ESPACE;
 161       pstr->mbs = new_mbs;
 162     }
 163   pstr->bufs_len = new_buf_len;
 164   return REG_NOERROR;
 165 }
 166
 167
 168 static void
 169 internal_function
 170 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 171                             RE_TRANSLATE_TYPE trans, bool icase,
 172                             const re_dfa_t *dfa)
 173 {
 174   pstr->raw_mbs = (const unsigned char *) str;
 175   pstr->len = len;
 176   pstr->raw_len = len;
 177   pstr->trans = trans;
 178   pstr->icase = icase;
 179   pstr->mbs_allocated = (trans != NULL || icase);
 180   pstr->mb_cur_max = dfa->mb_cur_max;
 181   pstr->is_utf8 = dfa->is_utf8;
 182   pstr->map_notascii = dfa->map_notascii;
 183   pstr->stop = pstr->len;
 184   pstr->raw_stop = pstr->stop;
 185 }
 186
 187 #ifdef RE_ENABLE_I18N
 188
 189 /* Build wide character buffer PSTR->WCS.
 190    If the byte sequence of the string are:
 191      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 192    Then wide character buffer will be:
 193      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 194    We use WEOF for padding, they indicate that the position isn't
 195    a first byte of a multibyte character.
 196
 197    Note that this function assumes PSTR->VALID_LEN elements are already
 198    built and starts from PSTR->VALID_LEN.  */
 199
 200 static void
 201 internal_function
 202 build_wcs_buffer (re_string_t *pstr)
 203 {
 204 #ifdef _LIBC
 205   unsigned char buf[MB_LEN_MAX];
 206   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 207 #else
 208   unsigned char buf[64];
 209 #endif
 210   mbstate_t prev_st;
 211   Idx byte_idx, end_idx, remain_len;
 212   size_t mbclen;
 213
 214   /* Build the buffers from pstr->valid_len to either pstr->len or
 215      pstr->bufs_len.  */
 216   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 217   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 218     {
 219       wchar_t wc;
 220       const char *p;
 221
 222       remain_len = end_idx - byte_idx;
 223       prev_st = pstr->cur_state;
 224       /* Apply the translation if we need.  */
 225       if (BE (pstr->trans != NULL, 0))
 226         {
 227           int i, ch;
 228
 229           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 230             {
 231               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 232               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 233             }
 234           p = (const char *) buf;
 235         }
 236       else
 237         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 238       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 239       if (BE (mbclen == (size_t) -1 || mbclen == 0
 240               || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
 241         {
 242           /* We treat these cases as a singlebyte character.  */
 243           mbclen = 1;
 244           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 245           if (BE (pstr->trans != NULL, 0))
 246             wc = pstr->trans[wc];
 247           pstr->cur_state = prev_st;
 248         }
 249       else if (BE (mbclen == (size_t) -2, 0))
 250         {
 251           /* The buffer doesn't have enough space, finish to build.  */
 252           pstr->cur_state = prev_st;
 253           break;
 254         }
 255
 256       /* Write wide character and padding.  */
 257       pstr->wcs[byte_idx++] = wc;
 258       /* Write paddings.  */
 259       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 260         pstr->wcs[byte_idx++] = WEOF;
 261     }
 262   pstr->valid_len = byte_idx;
 263   pstr->valid_raw_len = byte_idx;
 264 }
 265
 266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 267    but for REG_ICASE.  */
 268
 269 static reg_errcode_t
 270 internal_function __attribute_warn_unused_result__
 271 build_wcs_upper_buffer (re_string_t *pstr)
 272 {
 273   mbstate_t prev_st;
 274   Idx src_idx, byte_idx, end_idx, remain_len;
 275   size_t mbclen;
 276 #ifdef _LIBC
 277   char buf[MB_LEN_MAX];
 278   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 279 #else
 280   char buf[64];
 281 #endif
 282
 283   byte_idx = pstr->valid_len;
 284   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 285
 286   /* The following optimization assumes that ASCII characters can be
 287      mapped to wide characters with a simple cast.  */
 288   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 289     {
 290       while (byte_idx < end_idx)
 291         {
 292           wchar_t wc;
 293
 294           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 295               && mbsinit (&pstr->cur_state))
 296             {
 297               /* In case of a singlebyte character.  */
 298               pstr->mbs[byte_idx]
 299                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 300               /* The next step uses the assumption that wchar_t is encoded
 301                  ASCII-safe: all ASCII values can be converted like this.  */
 302               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 303               ++byte_idx;
 304               continue;
 305             }
 306
 307           remain_len = end_idx - byte_idx;
 308           prev_st = pstr->cur_state;
 309           mbclen = __mbrtowc (&wc,
 310                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 311                                + byte_idx), remain_len, &pstr->cur_state);
 312           if (BE (mbclen < (size_t) -2, 1))
 313             {
 314               wchar_t wcu = wc;
 315               if (iswlower (wc))
 316                 {
 317                   size_t mbcdlen;
 318
 319                   wcu = towupper (wc);
 320                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 321                   if (BE (mbclen == mbcdlen, 1))
 322                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 323                   else
 324                     {
 325                       src_idx = byte_idx;
 326                       goto offsets_needed;
 327                     }
 328                 }
 329               else
 330                 memcpy (pstr->mbs + byte_idx,
 331                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 332               pstr->wcs[byte_idx++] = wcu;
 333               /* Write paddings.  */
 334               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 335                 pstr->wcs[byte_idx++] = WEOF;
 336             }
 337           else if (mbclen == (size_t) -1 || mbclen == 0
 338                    || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 339             {
 340               /* It is an invalid character, an incomplete character
 341                  at the end of the string, or '\0'.  Just use the byte.  */
 342               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 343               pstr->mbs[byte_idx] = ch;
 344               /* And also cast it to wide char.  */
 345               pstr->wcs[byte_idx++] = (wchar_t) ch;
 346               if (BE (mbclen == (size_t) -1, 0))
 347                 pstr->cur_state = prev_st;
 348             }
 349           else
 350             {
 351               /* The buffer doesn't have enough space, finish to build.  */
 352               pstr->cur_state = prev_st;
 353               break;
 354             }
 355         }
 356       pstr->valid_len = byte_idx;
 357       pstr->valid_raw_len = byte_idx;
 358       return REG_NOERROR;
 359     }
 360   else
 361     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 362       {
 363         wchar_t wc;
 364         const char *p;
 365       offsets_needed:
 366         remain_len = end_idx - byte_idx;
 367         prev_st = pstr->cur_state;
 368         if (BE (pstr->trans != NULL, 0))
 369           {
 370             int i, ch;
 371
 372             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 373               {
 374                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 375                 buf[i] = pstr->trans[ch];
 376               }
 377             p = (const char *) buf;
 378           }
 379         else
 380           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 381         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 382         if (BE (mbclen < (size_t) -2, 1))
 383           {
 384             wchar_t wcu = wc;
 385             if (iswlower (wc))
 386               {
 387                 size_t mbcdlen;
 388
 389                 wcu = towupper (wc);
 390                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 391                 if (BE (mbclen == mbcdlen, 1))
 392                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 393                 else if (mbcdlen != (size_t) -1)
 394                   {
 395                     size_t i;
 396
 397                     if (byte_idx + mbcdlen > pstr->bufs_len)
 398                       {
 399                         pstr->cur_state = prev_st;
 400                         break;
 401                       }
 402
 403                     if (pstr->offsets == NULL)
 404                       {
 405                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 406
 407                         if (pstr->offsets == NULL)
 408                           return REG_ESPACE;
 409                       }
 410                     if (!pstr->offsets_needed)
 411                       {
 412                         for (i = 0; i < (size_t) byte_idx; ++i)
 413                           pstr->offsets[i] = i;
 414                         pstr->offsets_needed = 1;
 415                       }
 416
 417                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 418                     pstr->wcs[byte_idx] = wcu;
 419                     pstr->offsets[byte_idx] = src_idx;
 420                     for (i = 1; i < mbcdlen; ++i)
 421                       {
 422                         pstr->offsets[byte_idx + i]
 423                           = src_idx + (i < mbclen ? i : mbclen - 1);
 424                         pstr->wcs[byte_idx + i] = WEOF;
 425                       }
 426                     pstr->len += mbcdlen - mbclen;
 427                     if (pstr->raw_stop > src_idx)
 428                       pstr->stop += mbcdlen - mbclen;
 429                     end_idx = (pstr->bufs_len > pstr->len)
 430                               ? pstr->len : pstr->bufs_len;
 431                     byte_idx += mbcdlen;
 432                     src_idx += mbclen;
 433                     continue;
 434                   }
 435                 else
 436                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 437               }
 438             else
 439               memcpy (pstr->mbs + byte_idx, p, mbclen);
 440
 441             if (BE (pstr->offsets_needed != 0, 0))
 442               {
 443                 size_t i;
 444                 for (i = 0; i < mbclen; ++i)
 445                   pstr->offsets[byte_idx + i] = src_idx + i;
 446               }
 447             src_idx += mbclen;
 448
 449             pstr->wcs[byte_idx++] = wcu;
 450             /* Write paddings.  */
 451             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 452               pstr->wcs[byte_idx++] = WEOF;
 453           }
 454         else if (mbclen == (size_t) -1 || mbclen == 0
 455                  || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 456           {
 457             /* It is an invalid character or '\0'.  Just use the byte.  */
 458             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 459
 460             if (BE (pstr->trans != NULL, 0))
 461               ch = pstr->trans [ch];
 462             pstr->mbs[byte_idx] = ch;
 463
 464             if (BE (pstr->offsets_needed != 0, 0))
 465               pstr->offsets[byte_idx] = src_idx;
 466             ++src_idx;
 467
 468             /* And also cast it to wide char.  */
 469             pstr->wcs[byte_idx++] = (wchar_t) ch;
 470             if (BE (mbclen == (size_t) -1, 0))
 471               pstr->cur_state = prev_st;
 472           }
 473         else
 474           {
 475             /* The buffer doesn't have enough space, finish to build.  */
 476             pstr->cur_state = prev_st;
 477             break;
 478           }
 479       }
 480   pstr->valid_len = byte_idx;
 481   pstr->valid_raw_len = src_idx;
 482   return REG_NOERROR;
 483 }
 484
 485 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 486    Return the index.  */
 487
 488 static Idx
 489 internal_function
 490 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 491 {
 492   mbstate_t prev_st;
 493   Idx rawbuf_idx;
 494   size_t mbclen;
 495   wint_t wc = WEOF;
 496
 497   /* Skip the characters which are not necessary to check.  */
 498   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 499        rawbuf_idx < new_raw_idx;)
 500     {
 501       wchar_t wc2;
 502       Idx remain_len = pstr->raw_len - rawbuf_idx;
 503       prev_st = pstr->cur_state;
 504       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 505                           remain_len, &pstr->cur_state);
 506       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 507         {
 508           /* We treat these cases as a single byte character.  */
 509           if (mbclen == 0 || remain_len == 0)
 510             wc = L'\0';
 511           else
 512             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 513           mbclen = 1;
 514           pstr->cur_state = prev_st;
 515         }
 516       else
 517         wc = wc2;
 518       /* Then proceed the next character.  */
 519       rawbuf_idx += mbclen;
 520     }
 521   *last_wc = wc;
 522   return rawbuf_idx;
 523 }
 524 #endif /* RE_ENABLE_I18N  */
 525
 526 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 527    This function is used in case of REG_ICASE.  */
 528
 529 static void
 530 internal_function
 531 build_upper_buffer (re_string_t *pstr)
 532 {
 533   Idx char_idx, end_idx;
 534   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 535
 536   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 537     {
 538       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 539       if (BE (pstr->trans != NULL, 0))
 540         ch = pstr->trans[ch];
 541       if (islower (ch))
 542         pstr->mbs[char_idx] = toupper (ch);
 543       else
 544         pstr->mbs[char_idx] = ch;
 545     }
 546   pstr->valid_len = char_idx;
 547   pstr->valid_raw_len = char_idx;
 548 }
 549
 550 /* Apply TRANS to the buffer in PSTR.  */
 551
 552 static void
 553 internal_function
 554 re_string_translate_buffer (re_string_t *pstr)
 555 {
 556   Idx buf_idx, end_idx;
 557   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 558
 559   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 560     {
 561       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 562       pstr->mbs[buf_idx] = pstr->trans[ch];
 563     }
 564
 565   pstr->valid_len = buf_idx;
 566   pstr->valid_raw_len = buf_idx;
 567 }
 568
 569 /* This function re-construct the buffers.
 570    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 571    convert to upper case in case of REG_ICASE, apply translation.  */
 572
 573 static reg_errcode_t
 574 internal_function __attribute_warn_unused_result__
 575 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 576 {
 577   Idx offset;
 578
 579   if (BE (pstr->raw_mbs_idx <= idx, 0))
 580     offset = idx - pstr->raw_mbs_idx;
 581   else
 582     {
 583       /* Reset buffer.  */
 584 #ifdef RE_ENABLE_I18N
 585       if (pstr->mb_cur_max > 1)
 586         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 587 #endif /* RE_ENABLE_I18N */
 588       pstr->len = pstr->raw_len;
 589       pstr->stop = pstr->raw_stop;
 590       pstr->valid_len = 0;
 591       pstr->raw_mbs_idx = 0;
 592       pstr->valid_raw_len = 0;
 593       pstr->offsets_needed = 0;
 594       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 595                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 596       if (!pstr->mbs_allocated)
 597         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 598       offset = idx;
 599     }
 600
 601   if (BE (offset != 0, 1))
 602     {
 603       /* Should the already checked characters be kept?  */
 604       if (BE (offset < pstr->valid_raw_len, 1))
 605         {
 606           /* Yes, move them to the front of the buffer.  */
 607 #ifdef RE_ENABLE_I18N
 608           if (BE (pstr->offsets_needed, 0))
 609             {
 610               Idx low = 0, high = pstr->valid_len, mid;
 611               do
 612                 {
 613                   mid = (high + low) / 2;
 614                   if (pstr->offsets[mid] > offset)
 615                     high = mid;
 616                   else if (pstr->offsets[mid] < offset)
 617                     low = mid + 1;
 618                   else
 619                     break;
 620                 }
 621               while (low < high);
 622               if (pstr->offsets[mid] < offset)
 623                 ++mid;
 624               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 625                                                         eflags);
 626               /* This can be quite complicated, so handle specially
 627                  only the common and easy case where the character with
 628                  different length representation of lower and upper
 629                  case is present at or after offset.  */
 630               if (pstr->valid_len > offset
 631                   && mid == offset && pstr->offsets[mid] == offset)
 632                 {
 633                   memmove (pstr->wcs, pstr->wcs + offset,
 634                            (pstr->valid_len - offset) * sizeof (wint_t));
 635                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 636                   pstr->valid_len -= offset;
 637                   pstr->valid_raw_len -= offset;
 638                   for (low = 0; low < pstr->valid_len; low++)
 639                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 640                 }
 641               else
 642                 {
 643                   /* Otherwise, just find out how long the partial multibyte
 644                      character at offset is and fill it with WEOF/255.  */
 645                   pstr->len = pstr->raw_len - idx + offset;
 646                   pstr->stop = pstr->raw_stop - idx + offset;
 647                   pstr->offsets_needed = 0;
 648                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 649                     --mid;
 650                   while (mid < pstr->valid_len)
 651                     if (pstr->wcs[mid] != WEOF)
 652                       break;
 653                     else
 654                       ++mid;
 655                   if (mid == pstr->valid_len)
 656                     pstr->valid_len = 0;
 657                   else
 658                     {
 659                       pstr->valid_len = pstr->offsets[mid] - offset;
 660                       if (pstr->valid_len)
 661                         {
 662                           for (low = 0; low < pstr->valid_len; ++low)
 663                             pstr->wcs[low] = WEOF;
 664                           memset (pstr->mbs, 255, pstr->valid_len);
 665                         }
 666                     }
 667                   pstr->valid_raw_len = pstr->valid_len;
 668                 }
 669             }
 670           else
 671 #endif
 672             {
 673               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 674                                                         eflags);
 675 #ifdef RE_ENABLE_I18N
 676               if (pstr->mb_cur_max > 1)
 677                 memmove (pstr->wcs, pstr->wcs + offset,
 678                          (pstr->valid_len - offset) * sizeof (wint_t));
 679 #endif /* RE_ENABLE_I18N */
 680               if (BE (pstr->mbs_allocated, 0))
 681                 memmove (pstr->mbs, pstr->mbs + offset,
 682                          pstr->valid_len - offset);
 683               pstr->valid_len -= offset;
 684               pstr->valid_raw_len -= offset;
 685 #if DEBUG
 686               assert (pstr->valid_len > 0);
 687 #endif
 688             }
 689         }
 690       else
 691         {
 692 #ifdef RE_ENABLE_I18N
 693           /* No, skip all characters until IDX.  */
 694           Idx prev_valid_len = pstr->valid_len;
 695
 696           if (BE (pstr->offsets_needed, 0))
 697             {
 698               pstr->len = pstr->raw_len - idx + offset;
 699               pstr->stop = pstr->raw_stop - idx + offset;
 700               pstr->offsets_needed = 0;
 701             }
 702 #endif
 703           pstr->valid_len = 0;
 704 #ifdef RE_ENABLE_I18N
 705           if (pstr->mb_cur_max > 1)
 706             {
 707               Idx wcs_idx;
 708               wint_t wc = WEOF;
 709
 710               if (pstr->is_utf8)
 711                 {
 712                   const unsigned char *raw, *p, *end;
 713
 714                   /* Special case UTF-8.  Multi-byte chars start with any
 715                      byte other than 0x80 - 0xbf.  */
 716                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 717                   end = raw + (offset - pstr->mb_cur_max);
 718                   if (end < pstr->raw_mbs)
 719                     end = pstr->raw_mbs;
 720                   p = raw + offset - 1;
 721 #ifdef _LIBC
 722                   /* We know the wchar_t encoding is UCS4, so for the simple
 723                      case, ASCII characters, skip the conversion step.  */
 724                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 725                     {
 726                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 727                       /* pstr->valid_len = 0; */
 728                       wc = (wchar_t) *p;
 729                     }
 730                   else
 731 #endif
 732                     for (; p >= end; --p)
 733                       if ((*p & 0xc0) != 0x80)
 734                         {
 735                           mbstate_t cur_state;
 736                           wchar_t wc2;
 737                           Idx mlen = raw + pstr->len - p;
 738                           unsigned char buf[6];
 739                           size_t mbclen;
 740
 741                           const unsigned char *pp = p;
 742                           if (BE (pstr->trans != NULL, 0))
 743                             {
 744                               int i = mlen < 6 ? mlen : 6;
 745                               while (--i >= 0)
 746                                 buf[i] = pstr->trans[p[i]];
 747                               pp = buf;
 748                             }
 749                           /* XXX Don't use mbrtowc, we know which conversion
 750                              to use (UTF-8 -> UCS4).  */
 751                           memset (&cur_state, 0, sizeof (cur_state));
 752                           mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
 753                                               &cur_state);
 754                           if (raw + offset - p <= mbclen
 755                               && mbclen < (size_t) -2)
 756                             {
 757                               memset (&pstr->cur_state, '\0',
 758                                       sizeof (mbstate_t));
 759                               pstr->valid_len = mbclen - (raw + offset - p);
 760                               wc = wc2;
 761                             }
 762                           break;
 763                         }
 764                 }
 765
 766               if (wc == WEOF)
 767                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 768               if (wc == WEOF)
 769                 pstr->tip_context
 770                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 771               else
 772                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 773                                       && IS_WIDE_WORD_CHAR (wc))
 774                                      ? CONTEXT_WORD
 775                                      : ((IS_WIDE_NEWLINE (wc)
 776                                          && pstr->newline_anchor)
 777                                         ? CONTEXT_NEWLINE : 0));
 778               if (BE (pstr->valid_len, 0))
 779                 {
 780                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 781                     pstr->wcs[wcs_idx] = WEOF;
 782                   if (pstr->mbs_allocated)
 783                     memset (pstr->mbs, 255, pstr->valid_len);
 784                 }
 785               pstr->valid_raw_len = pstr->valid_len;
 786             }
 787           else
 788 #endif /* RE_ENABLE_I18N */
 789             {
 790               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 791               pstr->valid_raw_len = 0;
 792               if (pstr->trans)
 793                 c = pstr->trans[c];
 794               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 795                                    ? CONTEXT_WORD
 796                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 797                                       ? CONTEXT_NEWLINE : 0));
 798             }
 799         }
 800       if (!BE (pstr->mbs_allocated, 0))
 801         pstr->mbs += offset;
 802     }
 803   pstr->raw_mbs_idx = idx;
 804   pstr->len -= offset;
 805   pstr->stop -= offset;
 806
 807   /* Then build the buffers.  */
 808 #ifdef RE_ENABLE_I18N
 809   if (pstr->mb_cur_max > 1)
 810     {
 811       if (pstr->icase)
 812         {
 813           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 814           if (BE (ret != REG_NOERROR, 0))
 815             return ret;
 816         }
 817       else
 818         build_wcs_buffer (pstr);
 819     }
 820   else
 821 #endif /* RE_ENABLE_I18N */
 822     if (BE (pstr->mbs_allocated, 0))
 823       {
 824         if (pstr->icase)
 825           build_upper_buffer (pstr);
 826         else if (pstr->trans != NULL)
 827           re_string_translate_buffer (pstr);
 828       }
 829     else
 830       pstr->valid_len = pstr->len;
 831
 832   pstr->cur_idx = 0;
 833   return REG_NOERROR;
 834 }
 835
 836 static unsigned char
 837 internal_function __attribute__ ((pure))
 838 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 839 {
 840   int ch;
 841   Idx off;
 842
 843   /* Handle the common (easiest) cases first.  */
 844   if (BE (!pstr->mbs_allocated, 1))
 845     return re_string_peek_byte (pstr, idx);
 846
 847 #ifdef RE_ENABLE_I18N
 848   if (pstr->mb_cur_max > 1
 849       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 850     return re_string_peek_byte (pstr, idx);
 851 #endif
 852
 853   off = pstr->cur_idx + idx;
 854 #ifdef RE_ENABLE_I18N
 855   if (pstr->offsets_needed)
 856     off = pstr->offsets[off];
 857 #endif
 858
 859   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 860
 861 #ifdef RE_ENABLE_I18N
 862   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 863      this function returns CAPITAL LETTER I instead of first byte of
 864      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 865      since peek_byte_case doesn't advance cur_idx in any way.  */
 866   if (pstr->offsets_needed && !isascii (ch))
 867     return re_string_peek_byte (pstr, idx);
 868 #endif
 869
 870   return ch;
 871 }
 872
 873 static unsigned char
 874 internal_function
 875 re_string_fetch_byte_case (re_string_t *pstr)
 876 {
 877   if (BE (!pstr->mbs_allocated, 1))
 878     return re_string_fetch_byte (pstr);
 879
 880 #ifdef RE_ENABLE_I18N
 881   if (pstr->offsets_needed)
 882     {
 883       Idx off;
 884       int ch;
 885
 886       /* For tr_TR.UTF-8 [[:islower:]] there is
 887          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 888          in that case the whole multi-byte character and return
 889          the original letter.  On the other side, with
 890          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 891          anything else would complicate things too much.  */
 892
 893       if (!re_string_first_byte (pstr, pstr->cur_idx))
 894         return re_string_fetch_byte (pstr);
 895
 896       off = pstr->offsets[pstr->cur_idx];
 897       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 898
 899       if (! isascii (ch))
 900         return re_string_fetch_byte (pstr);
 901
 902       re_string_skip_bytes (pstr,
 903                             re_string_char_size_at (pstr, pstr->cur_idx));
 904       return ch;
 905     }
 906 #endif
 907
 908   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 909 }
 910
 911 static void
 912 internal_function
 913 re_string_destruct (re_string_t *pstr)
 914 {
 915 #ifdef RE_ENABLE_I18N
 916   re_free (pstr->wcs);
 917   re_free (pstr->offsets);
 918 #endif /* RE_ENABLE_I18N  */
 919   if (pstr->mbs_allocated)
 920     re_free (pstr->mbs);
 921 }
 922
 923 /* Return the context at IDX in INPUT.  */
 924
 925 static unsigned int
 926 internal_function
 927 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 928 {
 929   int c;
 930   if (BE (! REG_VALID_INDEX (idx), 0))
 931     /* In this case, we use the value stored in input->tip_context,
 932        since we can't know the character in input->mbs[-1] here.  */
 933     return input->tip_context;
 934   if (BE (idx == input->len, 0))
 935     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 936             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 937 #ifdef RE_ENABLE_I18N
 938   if (input->mb_cur_max > 1)
 939     {
 940       wint_t wc;
 941       Idx wc_idx = idx;
 942       while(input->wcs[wc_idx] == WEOF)
 943         {
 944 #ifdef DEBUG
 945           /* It must not happen.  */
 946           assert (REG_VALID_INDEX (wc_idx));
 947 #endif
 948           --wc_idx;
 949           if (! REG_VALID_INDEX (wc_idx))
 950             return input->tip_context;
 951         }
 952       wc = input->wcs[wc_idx];
 953       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 954         return CONTEXT_WORD;
 955       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 956               ? CONTEXT_NEWLINE : 0);
 957     }
 958   else
 959 #endif
 960     {
 961       c = re_string_byte_at (input, idx);
 962       if (bitset_contain (input->word_char, c))
 963         return CONTEXT_WORD;
 964       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 965     }
 966 }
 967 \f
 968 /* Functions for set operation.  */
 969
 970 static reg_errcode_t
 971 internal_function __attribute_warn_unused_result__
 972 re_node_set_alloc (re_node_set *set, Idx size)
 973 {
 974   set->alloc = size;
 975   set->nelem = 0;
 976   set->elems = re_malloc (Idx, size);
 977   if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0))
 978     return REG_ESPACE;
 979   return REG_NOERROR;
 980 }
 981
 982 static reg_errcode_t
 983 internal_function __attribute_warn_unused_result__
 984 re_node_set_init_1 (re_node_set *set, Idx elem)
 985 {
 986   set->alloc = 1;
 987   set->nelem = 1;
 988   set->elems = re_malloc (Idx, 1);
 989   if (BE (set->elems == NULL, 0))
 990     {
 991       set->alloc = set->nelem = 0;
 992       return REG_ESPACE;
 993     }
 994   set->elems[0] = elem;
 995   return REG_NOERROR;
 996 }
 997
 998 static reg_errcode_t
 999 internal_function __attribute_warn_unused_result__
1000 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1001 {
1002   set->alloc = 2;
1003   set->elems = re_malloc (Idx, 2);
1004   if (BE (set->elems == NULL, 0))
1005     return REG_ESPACE;
1006   if (elem1 == elem2)
1007     {
1008       set->nelem = 1;
1009       set->elems[0] = elem1;
1010     }
1011   else
1012     {
1013       set->nelem = 2;
1014       if (elem1 < elem2)
1015         {
1016           set->elems[0] = elem1;
1017           set->elems[1] = elem2;
1018         }
1019       else
1020         {
1021           set->elems[0] = elem2;
1022           set->elems[1] = elem1;
1023         }
1024     }
1025   return REG_NOERROR;
1026 }
1027
1028 static reg_errcode_t
1029 internal_function __attribute_warn_unused_result__
1030 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1031 {
1032   dest->nelem = src->nelem;
1033   if (src->nelem > 0)
1034     {
1035       dest->alloc = dest->nelem;
1036       dest->elems = re_malloc (Idx, dest->alloc);
1037       if (BE (dest->elems == NULL, 0))
1038         {
1039           dest->alloc = dest->nelem = 0;
1040           return REG_ESPACE;
1041         }
1042       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1043     }
1044   else
1045     re_node_set_init_empty (dest);
1046   return REG_NOERROR;
1047 }
1048
1049 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1050    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1051    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1052
1053 static reg_errcode_t
1054 internal_function __attribute_warn_unused_result__
1055 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1056                            const re_node_set *src2)
1057 {
1058   Idx i1, i2, is, id, delta, sbase;
1059   if (src1->nelem == 0 || src2->nelem == 0)
1060     return REG_NOERROR;
1061
1062   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1063      conservative estimate.  */
1064   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1065     {
1066       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1067       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1068       if (BE (new_elems == NULL, 0))
1069         return REG_ESPACE;
1070       dest->elems = new_elems;
1071       dest->alloc = new_alloc;
1072     }
1073
1074   /* Find the items in the intersection of SRC1 and SRC2, and copy
1075      into the top of DEST those that are not already in DEST itself.  */
1076   sbase = dest->nelem + src1->nelem + src2->nelem;
1077   i1 = src1->nelem - 1;
1078   i2 = src2->nelem - 1;
1079   id = dest->nelem - 1;
1080   for (;;)
1081     {
1082       if (src1->elems[i1] == src2->elems[i2])
1083         {
1084           /* Try to find the item in DEST.  Maybe we could binary search?  */
1085           while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1086             --id;
1087
1088           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1089             dest->elems[--sbase] = src1->elems[i1];
1090
1091           if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1092             break;
1093         }
1094
1095       /* Lower the highest of the two items.  */
1096       else if (src1->elems[i1] < src2->elems[i2])
1097         {
1098           if (! REG_VALID_INDEX (--i2))
1099             break;
1100         }
1101       else
1102         {
1103           if (! REG_VALID_INDEX (--i1))
1104             break;
1105         }
1106     }
1107
1108   id = dest->nelem - 1;
1109   is = dest->nelem + src1->nelem + src2->nelem - 1;
1110   delta = is - sbase + 1;
1111
1112   /* Now copy.  When DELTA becomes zero, the remaining
1113      DEST elements are already in place; this is more or
1114      less the same loop that is in re_node_set_merge.  */
1115   dest->nelem += delta;
1116   if (delta > 0 && REG_VALID_INDEX (id))
1117     for (;;)
1118       {
1119         if (dest->elems[is] > dest->elems[id])
1120           {
1121             /* Copy from the top.  */
1122             dest->elems[id + delta--] = dest->elems[is--];
1123             if (delta == 0)
1124               break;
1125           }
1126         else
1127           {
1128             /* Slide from the bottom.  */
1129             dest->elems[id + delta] = dest->elems[id];
1130             if (! REG_VALID_INDEX (--id))
1131               break;
1132           }
1133       }
1134
1135   /* Copy remaining SRC elements.  */
1136   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1137
1138   return REG_NOERROR;
1139 }
1140
1141 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1142    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1143
1144 static reg_errcode_t
1145 internal_function __attribute_warn_unused_result__
1146 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1147                         const re_node_set *src2)
1148 {
1149   Idx i1, i2, id;
1150   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1151     {
1152       dest->alloc = src1->nelem + src2->nelem;
1153       dest->elems = re_malloc (Idx, dest->alloc);
1154       if (BE (dest->elems == NULL, 0))
1155         return REG_ESPACE;
1156     }
1157   else
1158     {
1159       if (src1 != NULL && src1->nelem > 0)
1160         return re_node_set_init_copy (dest, src1);
1161       else if (src2 != NULL && src2->nelem > 0)
1162         return re_node_set_init_copy (dest, src2);
1163       else
1164         re_node_set_init_empty (dest);
1165       return REG_NOERROR;
1166     }
1167   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1168     {
1169       if (src1->elems[i1] > src2->elems[i2])
1170         {
1171           dest->elems[id++] = src2->elems[i2++];
1172           continue;
1173         }
1174       if (src1->elems[i1] == src2->elems[i2])
1175         ++i2;
1176       dest->elems[id++] = src1->elems[i1++];
1177     }
1178   if (i1 < src1->nelem)
1179     {
1180       memcpy (dest->elems + id, src1->elems + i1,
1181              (src1->nelem - i1) * sizeof (Idx));
1182       id += src1->nelem - i1;
1183     }
1184   else if (i2 < src2->nelem)
1185     {
1186       memcpy (dest->elems + id, src2->elems + i2,
1187              (src2->nelem - i2) * sizeof (Idx));
1188       id += src2->nelem - i2;
1189     }
1190   dest->nelem = id;
1191   return REG_NOERROR;
1192 }
1193
1194 /* Calculate the union set of the sets DEST and SRC. And store it to
1195    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1196
1197 static reg_errcode_t
1198 internal_function __attribute_warn_unused_result__
1199 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1200 {
1201   Idx is, id, sbase, delta;
1202   if (src == NULL || src->nelem == 0)
1203     return REG_NOERROR;
1204   if (dest->alloc < 2 * src->nelem + dest->nelem)
1205     {
1206       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1207       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1208       if (BE (new_buffer == NULL, 0))
1209         return REG_ESPACE;
1210       dest->elems = new_buffer;
1211       dest->alloc = new_alloc;
1212     }
1213
1214   if (BE (dest->nelem == 0, 0))
1215     {
1216       dest->nelem = src->nelem;
1217       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1218       return REG_NOERROR;
1219     }
1220
1221   /* Copy into the top of DEST the items of SRC that are not
1222      found in DEST.  Maybe we could binary search in DEST?  */
1223   for (sbase = dest->nelem + 2 * src->nelem,
1224        is = src->nelem - 1, id = dest->nelem - 1;
1225        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1226     {
1227       if (dest->elems[id] == src->elems[is])
1228         is--, id--;
1229       else if (dest->elems[id] < src->elems[is])
1230         dest->elems[--sbase] = src->elems[is--];
1231       else /* if (dest->elems[id] > src->elems[is]) */
1232         --id;
1233     }
1234
1235   if (REG_VALID_INDEX (is))
1236     {
1237       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1238       sbase -= is + 1;
1239       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1240     }
1241
1242   id = dest->nelem - 1;
1243   is = dest->nelem + 2 * src->nelem - 1;
1244   delta = is - sbase + 1;
1245   if (delta == 0)
1246     return REG_NOERROR;
1247
1248   /* Now copy.  When DELTA becomes zero, the remaining
1249      DEST elements are already in place.  */
1250   dest->nelem += delta;
1251   for (;;)
1252     {
1253       if (dest->elems[is] > dest->elems[id])
1254         {
1255           /* Copy from the top.  */
1256           dest->elems[id + delta--] = dest->elems[is--];
1257           if (delta == 0)
1258             break;
1259         }
1260       else
1261         {
1262           /* Slide from the bottom.  */
1263           dest->elems[id + delta] = dest->elems[id];
1264           if (! REG_VALID_INDEX (--id))
1265             {
1266               /* Copy remaining SRC elements.  */
1267               memcpy (dest->elems, dest->elems + sbase,
1268                       delta * sizeof (Idx));
1269               break;
1270             }
1271         }
1272     }
1273
1274   return REG_NOERROR;
1275 }
1276
1277 /* Insert the new element ELEM to the re_node_set* SET.
1278    SET should not already have ELEM.
1279    Return true if successful.  */
1280
1281 static bool
1282 internal_function __attribute_warn_unused_result__
1283 re_node_set_insert (re_node_set *set, Idx elem)
1284 {
1285   Idx idx;
1286   /* In case the set is empty.  */
1287   if (set->alloc == 0)
1288     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1289
1290   if (BE (set->nelem, 0) == 0)
1291     {
1292       /* We already guaranteed above that set->alloc != 0.  */
1293       set->elems[0] = elem;
1294       ++set->nelem;
1295       return true;
1296     }
1297
1298   /* Realloc if we need.  */
1299   if (set->alloc == set->nelem)
1300     {
1301       Idx *new_elems;
1302       set->alloc = set->alloc * 2;
1303       new_elems = re_realloc (set->elems, Idx, set->alloc);
1304       if (BE (new_elems == NULL, 0))
1305         return false;
1306       set->elems = new_elems;
1307     }
1308
1309   /* Move the elements which follows the new element.  Test the
1310      first element separately to skip a check in the inner loop.  */
1311   if (elem < set->elems[0])
1312     {
1313       idx = 0;
1314       for (idx = set->nelem; idx > 0; idx--)
1315         set->elems[idx] = set->elems[idx - 1];
1316     }
1317   else
1318     {
1319       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1320         set->elems[idx] = set->elems[idx - 1];
1321     }
1322
1323   /* Insert the new element.  */
1324   set->elems[idx] = elem;
1325   ++set->nelem;
1326   return true;
1327 }
1328
1329 /* Insert the new element ELEM to the re_node_set* SET.
1330    SET should not already have any element greater than or equal to ELEM.
1331    Return true if successful.  */
1332
1333 static bool
1334 internal_function __attribute_warn_unused_result__
1335 re_node_set_insert_last (re_node_set *set, Idx elem)
1336 {
1337   /* Realloc if we need.  */
1338   if (set->alloc == set->nelem)
1339     {
1340       Idx *new_elems;
1341       set->alloc = (set->alloc + 1) * 2;
1342       new_elems = re_realloc (set->elems, Idx, set->alloc);
1343       if (BE (new_elems == NULL, 0))
1344         return false;
1345       set->elems = new_elems;
1346     }
1347
1348   /* Insert the new element.  */
1349   set->elems[set->nelem++] = elem;
1350   return true;
1351 }
1352
1353 /* Compare two node sets SET1 and SET2.
1354    Return true if SET1 and SET2 are equivalent.  */
1355
1356 static bool
1357 internal_function __attribute__ ((pure))
1358 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1359 {
1360   Idx i;
1361   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1362     return false;
1363   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1364     if (set1->elems[i] != set2->elems[i])
1365       return false;
1366   return true;
1367 }
1368
1369 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1370
1371 static Idx
1372 internal_function __attribute__ ((pure))
1373 re_node_set_contains (const re_node_set *set, Idx elem)
1374 {
1375   __re_size_t idx, right, mid;
1376   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1377     return 0;
1378
1379   /* Binary search the element.  */
1380   idx = 0;
1381   right = set->nelem - 1;
1382   while (idx < right)
1383     {
1384       mid = (idx + right) / 2;
1385       if (set->elems[mid] < elem)
1386         idx = mid + 1;
1387       else
1388         right = mid;
1389     }
1390   return set->elems[idx] == elem ? idx + 1 : 0;
1391 }
1392
1393 static void
1394 internal_function
1395 re_node_set_remove_at (re_node_set *set, Idx idx)
1396 {
1397   if (idx < 0 || idx >= set->nelem)
1398     return;
1399   --set->nelem;
1400   for (; idx < set->nelem; idx++)
1401     set->elems[idx] = set->elems[idx + 1];
1402 }
1403 \f
1404
1405 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1406    Or return REG_MISSING if an error occurred.  */
1407
1408 static Idx
1409 internal_function
1410 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1411 {
1412   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1413     {
1414       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1415       Idx *new_nexts, *new_indices;
1416       re_node_set *new_edests, *new_eclosures;
1417       re_token_t *new_nodes;
1418
1419       /* Avoid overflows in realloc.  */
1420       const size_t max_object_size = MAX (sizeof (re_token_t),
1421                                           MAX (sizeof (re_node_set),
1422                                                sizeof (Idx)));
1423       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1424         return REG_MISSING;
1425
1426       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1427       if (BE (new_nodes == NULL, 0))
1428         return REG_MISSING;
1429       dfa->nodes = new_nodes;
1430       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1431       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1432       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1433       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1434       if (BE (new_nexts == NULL || new_indices == NULL
1435               || new_edests == NULL || new_eclosures == NULL, 0))
1436         return REG_MISSING;
1437       dfa->nexts = new_nexts;
1438       dfa->org_indices = new_indices;
1439       dfa->edests = new_edests;
1440       dfa->eclosures = new_eclosures;
1441       dfa->nodes_alloc = new_nodes_alloc;
1442     }
1443   dfa->nodes[dfa->nodes_len] = token;
1444   dfa->nodes[dfa->nodes_len].constraint = 0;
1445 #ifdef RE_ENABLE_I18N
1446   dfa->nodes[dfa->nodes_len].accept_mb =
1447     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1448      || token.type == COMPLEX_BRACKET);
1449 #endif
1450   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1451   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1453   return dfa->nodes_len++;
1454 }
1455
1456 static re_hashval_t
1457 internal_function
1458 calc_state_hash (const re_node_set *nodes, unsigned int context)
1459 {
1460   re_hashval_t hash = nodes->nelem + context;
1461   Idx i;
1462   for (i = 0 ; i < nodes->nelem ; i++)
1463     hash += nodes->elems[i];
1464   return hash;
1465 }
1466
1467 /* Search for the state whose node_set is equivalent to NODES.
1468    Return the pointer to the state, if we found it in the DFA.
1469    Otherwise create the new one and return it.  In case of an error
1470    return NULL and set the error code in ERR.
1471    Note: - We assume NULL as the invalid state, then it is possible that
1472            return value is NULL and ERR is REG_NOERROR.
1473          - We never return non-NULL value in case of any errors, it is for
1474            optimization.  */
1475
1476 static re_dfastate_t *
1477 internal_function __attribute_warn_unused_result__
1478 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1479                   const re_node_set *nodes)
1480 {
1481   re_hashval_t hash;
1482   re_dfastate_t *new_state;
1483   struct re_state_table_entry *spot;
1484   Idx i;
1485 #ifdef lint
1486   /* Suppress bogus uninitialized-variable warnings.  */
1487   *err = REG_NOERROR;
1488 #endif
1489   if (BE (nodes->nelem == 0, 0))
1490     {
1491       *err = REG_NOERROR;
1492       return NULL;
1493     }
1494   hash = calc_state_hash (nodes, 0);
1495   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1496
1497   for (i = 0 ; i < spot->num ; i++)
1498     {
1499       re_dfastate_t *state = spot->array[i];
1500       if (hash != state->hash)
1501         continue;
1502       if (re_node_set_compare (&state->nodes, nodes))
1503         return state;
1504     }
1505
1506   /* There are no appropriate state in the dfa, create the new one.  */
1507   new_state = create_ci_newstate (dfa, nodes, hash);
1508   if (BE (new_state == NULL, 0))
1509     *err = REG_ESPACE;
1510
1511   return new_state;
1512 }
1513
1514 /* Search for the state whose node_set is equivalent to NODES and
1515    whose context is equivalent to CONTEXT.
1516    Return the pointer to the state, if we found it in the DFA.
1517    Otherwise create the new one and return it.  In case of an error
1518    return NULL and set the error code in ERR.
1519    Note: - We assume NULL as the invalid state, then it is possible that
1520            return value is NULL and ERR is REG_NOERROR.
1521          - We never return non-NULL value in case of any errors, it is for
1522            optimization.  */
1523
1524 static re_dfastate_t *
1525 internal_function __attribute_warn_unused_result__
1526 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1527                           const re_node_set *nodes, unsigned int context)
1528 {
1529   re_hashval_t hash;
1530   re_dfastate_t *new_state;
1531   struct re_state_table_entry *spot;
1532   Idx i;
1533 #ifdef lint
1534   /* Suppress bogus uninitialized-variable warnings.  */
1535   *err = REG_NOERROR;
1536 #endif
1537   if (nodes->nelem == 0)
1538     {
1539       *err = REG_NOERROR;
1540       return NULL;
1541     }
1542   hash = calc_state_hash (nodes, context);
1543   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1544
1545   for (i = 0 ; i < spot->num ; i++)
1546     {
1547       re_dfastate_t *state = spot->array[i];
1548       if (state->hash == hash
1549           && state->context == context
1550           && re_node_set_compare (state->entrance_nodes, nodes))
1551         return state;
1552     }
1553   /* There are no appropriate state in 'dfa', create the new one.  */
1554   new_state = create_cd_newstate (dfa, nodes, context, hash);
1555   if (BE (new_state == NULL, 0))
1556     *err = REG_ESPACE;
1557
1558   return new_state;
1559 }
1560
1561 /* Finish initialization of the new state NEWSTATE, and using its hash value
1562    HASH put in the appropriate bucket of DFA's state table.  Return value
1563    indicates the error code if failed.  */
1564
1565 static reg_errcode_t
1566 __attribute_warn_unused_result__
1567 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1568                 re_hashval_t hash)
1569 {
1570   struct re_state_table_entry *spot;
1571   reg_errcode_t err;
1572   Idx i;
1573
1574   newstate->hash = hash;
1575   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1576   if (BE (err != REG_NOERROR, 0))
1577     return REG_ESPACE;
1578   for (i = 0; i < newstate->nodes.nelem; i++)
1579     {
1580       Idx elem = newstate->nodes.elems[i];
1581       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1582         if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1583           return REG_ESPACE;
1584     }
1585
1586   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1587   if (BE (spot->alloc <= spot->num, 0))
1588     {
1589       Idx new_alloc = 2 * spot->num + 2;
1590       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1591                                               new_alloc);
1592       if (BE (new_array == NULL, 0))
1593         return REG_ESPACE;
1594       spot->array = new_array;
1595       spot->alloc = new_alloc;
1596     }
1597   spot->array[spot->num++] = newstate;
1598   return REG_NOERROR;
1599 }
1600
1601 static void
1602 free_state (re_dfastate_t *state)
1603 {
1604   re_node_set_free (&state->non_eps_nodes);
1605   re_node_set_free (&state->inveclosure);
1606   if (state->entrance_nodes != &state->nodes)
1607     {
1608       re_node_set_free (state->entrance_nodes);
1609       re_free (state->entrance_nodes);
1610     }
1611   re_node_set_free (&state->nodes);
1612   re_free (state->word_trtable);
1613   re_free (state->trtable);
1614   re_free (state);
1615 }
1616
1617 /* Create the new state which is independent of contexts.
1618    Return the new state if succeeded, otherwise return NULL.  */
1619
1620 static re_dfastate_t *
1621 internal_function __attribute_warn_unused_result__
1622 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1623                     re_hashval_t hash)
1624 {
1625   Idx i;
1626   reg_errcode_t err;
1627   re_dfastate_t *newstate;
1628
1629   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1630   if (BE (newstate == NULL, 0))
1631     return NULL;
1632   err = re_node_set_init_copy (&newstate->nodes, nodes);
1633   if (BE (err != REG_NOERROR, 0))
1634     {
1635       re_free (newstate);
1636       return NULL;
1637     }
1638
1639   newstate->entrance_nodes = &newstate->nodes;
1640   for (i = 0 ; i < nodes->nelem ; i++)
1641     {
1642       re_token_t *node = dfa->nodes + nodes->elems[i];
1643       re_token_type_t type = node->type;
1644       if (type == CHARACTER && !node->constraint)
1645         continue;
1646 #ifdef RE_ENABLE_I18N
1647       newstate->accept_mb |= node->accept_mb;
1648 #endif /* RE_ENABLE_I18N */
1649
1650       /* If the state has the halt node, the state is a halt state.  */
1651       if (type == END_OF_RE)
1652         newstate->halt = 1;
1653       else if (type == OP_BACK_REF)
1654         newstate->has_backref = 1;
1655       else if (type == ANCHOR || node->constraint)
1656         newstate->has_constraint = 1;
1657     }
1658   err = register_state (dfa, newstate, hash);
1659   if (BE (err != REG_NOERROR, 0))
1660     {
1661       free_state (newstate);
1662       newstate = NULL;
1663     }
1664   return newstate;
1665 }
1666
1667 /* Create the new state which is depend on the context CONTEXT.
1668    Return the new state if succeeded, otherwise return NULL.  */
1669
1670 static re_dfastate_t *
1671 internal_function __attribute_warn_unused_result__
1672 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1673                     unsigned int context, re_hashval_t hash)
1674 {
1675   Idx i, nctx_nodes = 0;
1676   reg_errcode_t err;
1677   re_dfastate_t *newstate;
1678
1679   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1680   if (BE (newstate == NULL, 0))
1681     return NULL;
1682   err = re_node_set_init_copy (&newstate->nodes, nodes);
1683   if (BE (err != REG_NOERROR, 0))
1684     {
1685       re_free (newstate);
1686       return NULL;
1687     }
1688
1689   newstate->context = context;
1690   newstate->entrance_nodes = &newstate->nodes;
1691
1692   for (i = 0 ; i < nodes->nelem ; i++)
1693     {
1694       re_token_t *node = dfa->nodes + nodes->elems[i];
1695       re_token_type_t type = node->type;
1696       unsigned int constraint = node->constraint;
1697
1698       if (type == CHARACTER && !constraint)
1699         continue;
1700 #ifdef RE_ENABLE_I18N
1701       newstate->accept_mb |= node->accept_mb;
1702 #endif /* RE_ENABLE_I18N */
1703
1704       /* If the state has the halt node, the state is a halt state.  */
1705       if (type == END_OF_RE)
1706         newstate->halt = 1;
1707       else if (type == OP_BACK_REF)
1708         newstate->has_backref = 1;
1709
1710       if (constraint)
1711         {
1712           if (newstate->entrance_nodes == &newstate->nodes)
1713             {
1714               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1715               if (BE (newstate->entrance_nodes == NULL, 0))
1716                 {
1717                   free_state (newstate);
1718                   return NULL;
1719                 }
1720               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1721                   != REG_NOERROR)
1722                 return NULL;
1723               nctx_nodes = 0;
1724               newstate->has_constraint = 1;
1725             }
1726
1727           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1728             {
1729               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1730               ++nctx_nodes;
1731             }
1732         }
1733     }
1734   err = register_state (dfa, newstate, hash);
1735   if (BE (err != REG_NOERROR, 0))
1736     {
1737       free_state (newstate);
1738       newstate = NULL;
1739     }
1740   return  newstate;
1741 }