lib/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2012 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU Lesser General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public License along
  17    with this program; if not, see <http://www.gnu.org/licenses/>.  */
  18
  19 static void re_string_construct_common (const char *str, Idx len,
  20                                         re_string_t *pstr,
  21                                         RE_TRANSLATE_TYPE trans, bool icase,
  22                                         const re_dfa_t *dfa) internal_function;
  23 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  24                                           const re_node_set *nodes,
  25                                           re_hashval_t hash) internal_function;
  26 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  27                                           const re_node_set *nodes,
  28                                           unsigned int context,
  29                                           re_hashval_t hash) internal_function;
  30 \f
  31 /* Functions for string operation.  */
  32
  33 /* This function allocate the buffers.  It is necessary to call
  34    re_string_reconstruct before using the object.  */
  35
  36 static reg_errcode_t
  37 internal_function __attribute_warn_unused_result__
  38 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  39                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  40 {
  41   reg_errcode_t ret;
  42   Idx init_buf_len;
  43
  44   /* Ensure at least one character fits into the buffers.  */
  45   if (init_len < dfa->mb_cur_max)
  46     init_len = dfa->mb_cur_max;
  47   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  48   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  49
  50   ret = re_string_realloc_buffers (pstr, init_buf_len);
  51   if (BE (ret != REG_NOERROR, 0))
  52     return ret;
  53
  54   pstr->word_char = dfa->word_char;
  55   pstr->word_ops_used = dfa->word_ops_used;
  56   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  57   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  58   pstr->valid_raw_len = pstr->valid_len;
  59   return REG_NOERROR;
  60 }
  61
  62 /* This function allocate the buffers, and initialize them.  */
  63
  64 static reg_errcode_t
  65 internal_function __attribute_warn_unused_result__
  66 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  67                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  68 {
  69   reg_errcode_t ret;
  70   memset (pstr, '\0', sizeof (re_string_t));
  71   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  72
  73   if (len > 0)
  74     {
  75       ret = re_string_realloc_buffers (pstr, len + 1);
  76       if (BE (ret != REG_NOERROR, 0))
  77         return ret;
  78     }
  79   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  80
  81   if (icase)
  82     {
  83 #ifdef RE_ENABLE_I18N
  84       if (dfa->mb_cur_max > 1)
  85         {
  86           while (1)
  87             {
  88               ret = build_wcs_upper_buffer (pstr);
  89               if (BE (ret != REG_NOERROR, 0))
  90                 return ret;
  91               if (pstr->valid_raw_len >= len)
  92                 break;
  93               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  94                 break;
  95               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  96               if (BE (ret != REG_NOERROR, 0))
  97                 return ret;
  98             }
  99         }
 100       else
 101 #endif /* RE_ENABLE_I18N  */
 102         build_upper_buffer (pstr);
 103     }
 104   else
 105     {
 106 #ifdef RE_ENABLE_I18N
 107       if (dfa->mb_cur_max > 1)
 108         build_wcs_buffer (pstr);
 109       else
 110 #endif /* RE_ENABLE_I18N  */
 111         {
 112           if (trans != NULL)
 113             re_string_translate_buffer (pstr);
 114           else
 115             {
 116               pstr->valid_len = pstr->bufs_len;
 117               pstr->valid_raw_len = pstr->bufs_len;
 118             }
 119         }
 120     }
 121
 122   return REG_NOERROR;
 123 }
 124
 125 /* Helper functions for re_string_allocate, and re_string_construct.  */
 126
 127 static reg_errcode_t
 128 internal_function __attribute_warn_unused_result__
 129 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 130 {
 131 #ifdef RE_ENABLE_I18N
 132   if (pstr->mb_cur_max > 1)
 133     {
 134       wint_t *new_wcs;
 135
 136       /* Avoid overflow in realloc.  */
 137       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 138       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
 139         return REG_ESPACE;
 140
 141       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 142       if (BE (new_wcs == NULL, 0))
 143         return REG_ESPACE;
 144       pstr->wcs = new_wcs;
 145       if (pstr->offsets != NULL)
 146         {
 147           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 148           if (BE (new_offsets == NULL, 0))
 149             return REG_ESPACE;
 150           pstr->offsets = new_offsets;
 151         }
 152     }
 153 #endif /* RE_ENABLE_I18N  */
 154   if (pstr->mbs_allocated)
 155     {
 156       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 157                                            new_buf_len);
 158       if (BE (new_mbs == NULL, 0))
 159         return REG_ESPACE;
 160       pstr->mbs = new_mbs;
 161     }
 162   pstr->bufs_len = new_buf_len;
 163   return REG_NOERROR;
 164 }
 165
 166
 167 static void
 168 internal_function
 169 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 170                             RE_TRANSLATE_TYPE trans, bool icase,
 171                             const re_dfa_t *dfa)
 172 {
 173   pstr->raw_mbs = (const unsigned char *) str;
 174   pstr->len = len;
 175   pstr->raw_len = len;
 176   pstr->trans = trans;
 177   pstr->icase = icase;
 178   pstr->mbs_allocated = (trans != NULL || icase);
 179   pstr->mb_cur_max = dfa->mb_cur_max;
 180   pstr->is_utf8 = dfa->is_utf8;
 181   pstr->map_notascii = dfa->map_notascii;
 182   pstr->stop = pstr->len;
 183   pstr->raw_stop = pstr->stop;
 184 }
 185
 186 #ifdef RE_ENABLE_I18N
 187
 188 /* Build wide character buffer PSTR->WCS.
 189    If the byte sequence of the string are:
 190      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 191    Then wide character buffer will be:
 192      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 193    We use WEOF for padding, they indicate that the position isn't
 194    a first byte of a multibyte character.
 195
 196    Note that this function assumes PSTR->VALID_LEN elements are already
 197    built and starts from PSTR->VALID_LEN.  */
 198
 199 static void
 200 internal_function
 201 build_wcs_buffer (re_string_t *pstr)
 202 {
 203 #ifdef _LIBC
 204   unsigned char buf[MB_LEN_MAX];
 205   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 206 #else
 207   unsigned char buf[64];
 208 #endif
 209   mbstate_t prev_st;
 210   Idx byte_idx, end_idx, remain_len;
 211   size_t mbclen;
 212
 213   /* Build the buffers from pstr->valid_len to either pstr->len or
 214      pstr->bufs_len.  */
 215   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 216   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 217     {
 218       wchar_t wc;
 219       const char *p;
 220
 221       remain_len = end_idx - byte_idx;
 222       prev_st = pstr->cur_state;
 223       /* Apply the translation if we need.  */
 224       if (BE (pstr->trans != NULL, 0))
 225         {
 226           int i, ch;
 227
 228           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 229             {
 230               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 231               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 232             }
 233           p = (const char *) buf;
 234         }
 235       else
 236         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 237       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 238       if (BE (mbclen == (size_t) -1 || mbclen == 0
 239               || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
 240         {
 241           /* We treat these cases as a singlebyte character.  */
 242           mbclen = 1;
 243           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 244           if (BE (pstr->trans != NULL, 0))
 245             wc = pstr->trans[wc];
 246           pstr->cur_state = prev_st;
 247         }
 248       else if (BE (mbclen == (size_t) -2, 0))
 249         {
 250           /* The buffer doesn't have enough space, finish to build.  */
 251           pstr->cur_state = prev_st;
 252           break;
 253         }
 254
 255       /* Write wide character and padding.  */
 256       pstr->wcs[byte_idx++] = wc;
 257       /* Write paddings.  */
 258       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 259         pstr->wcs[byte_idx++] = WEOF;
 260     }
 261   pstr->valid_len = byte_idx;
 262   pstr->valid_raw_len = byte_idx;
 263 }
 264
 265 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 266    but for REG_ICASE.  */
 267
 268 static reg_errcode_t
 269 internal_function __attribute_warn_unused_result__
 270 build_wcs_upper_buffer (re_string_t *pstr)
 271 {
 272   mbstate_t prev_st;
 273   Idx src_idx, byte_idx, end_idx, remain_len;
 274   size_t mbclen;
 275 #ifdef _LIBC
 276   char buf[MB_LEN_MAX];
 277   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 278 #else
 279   char buf[64];
 280 #endif
 281
 282   byte_idx = pstr->valid_len;
 283   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 284
 285   /* The following optimization assumes that ASCII characters can be
 286      mapped to wide characters with a simple cast.  */
 287   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 288     {
 289       while (byte_idx < end_idx)
 290         {
 291           wchar_t wc;
 292
 293           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 294               && mbsinit (&pstr->cur_state))
 295             {
 296               /* In case of a singlebyte character.  */
 297               pstr->mbs[byte_idx]
 298                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 299               /* The next step uses the assumption that wchar_t is encoded
 300                  ASCII-safe: all ASCII values can be converted like this.  */
 301               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 302               ++byte_idx;
 303               continue;
 304             }
 305
 306           remain_len = end_idx - byte_idx;
 307           prev_st = pstr->cur_state;
 308           mbclen = __mbrtowc (&wc,
 309                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 310                                + byte_idx), remain_len, &pstr->cur_state);
 311           if (BE (mbclen < (size_t) -2, 1))
 312             {
 313               wchar_t wcu = wc;
 314               if (iswlower (wc))
 315                 {
 316                   size_t mbcdlen;
 317
 318                   wcu = towupper (wc);
 319                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 320                   if (BE (mbclen == mbcdlen, 1))
 321                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 322                   else
 323                     {
 324                       src_idx = byte_idx;
 325                       goto offsets_needed;
 326                     }
 327                 }
 328               else
 329                 memcpy (pstr->mbs + byte_idx,
 330                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 331               pstr->wcs[byte_idx++] = wcu;
 332               /* Write paddings.  */
 333               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 334                 pstr->wcs[byte_idx++] = WEOF;
 335             }
 336           else if (mbclen == (size_t) -1 || mbclen == 0
 337                    || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 338             {
 339               /* It is an invalid character, an incomplete character
 340                  at the end of the string, or '\0'.  Just use the byte.  */
 341               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 342               pstr->mbs[byte_idx] = ch;
 343               /* And also cast it to wide char.  */
 344               pstr->wcs[byte_idx++] = (wchar_t) ch;
 345               if (BE (mbclen == (size_t) -1, 0))
 346                 pstr->cur_state = prev_st;
 347             }
 348           else
 349             {
 350               /* The buffer doesn't have enough space, finish to build.  */
 351               pstr->cur_state = prev_st;
 352               break;
 353             }
 354         }
 355       pstr->valid_len = byte_idx;
 356       pstr->valid_raw_len = byte_idx;
 357       return REG_NOERROR;
 358     }
 359   else
 360     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 361       {
 362         wchar_t wc;
 363         const char *p;
 364       offsets_needed:
 365         remain_len = end_idx - byte_idx;
 366         prev_st = pstr->cur_state;
 367         if (BE (pstr->trans != NULL, 0))
 368           {
 369             int i, ch;
 370
 371             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 372               {
 373                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 374                 buf[i] = pstr->trans[ch];
 375               }
 376             p = (const char *) buf;
 377           }
 378         else
 379           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 380         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 381         if (BE (mbclen < (size_t) -2, 1))
 382           {
 383             wchar_t wcu = wc;
 384             if (iswlower (wc))
 385               {
 386                 size_t mbcdlen;
 387
 388                 wcu = towupper (wc);
 389                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 390                 if (BE (mbclen == mbcdlen, 1))
 391                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 392                 else if (mbcdlen != (size_t) -1)
 393                   {
 394                     size_t i;
 395
 396                     if (byte_idx + mbcdlen > pstr->bufs_len)
 397                       {
 398                         pstr->cur_state = prev_st;
 399                         break;
 400                       }
 401
 402                     if (pstr->offsets == NULL)
 403                       {
 404                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 405
 406                         if (pstr->offsets == NULL)
 407                           return REG_ESPACE;
 408                       }
 409                     if (!pstr->offsets_needed)
 410                       {
 411                         for (i = 0; i < (size_t) byte_idx; ++i)
 412                           pstr->offsets[i] = i;
 413                         pstr->offsets_needed = 1;
 414                       }
 415
 416                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 417                     pstr->wcs[byte_idx] = wcu;
 418                     pstr->offsets[byte_idx] = src_idx;
 419                     for (i = 1; i < mbcdlen; ++i)
 420                       {
 421                         pstr->offsets[byte_idx + i]
 422                           = src_idx + (i < mbclen ? i : mbclen - 1);
 423                         pstr->wcs[byte_idx + i] = WEOF;
 424                       }
 425                     pstr->len += mbcdlen - mbclen;
 426                     if (pstr->raw_stop > src_idx)
 427                       pstr->stop += mbcdlen - mbclen;
 428                     end_idx = (pstr->bufs_len > pstr->len)
 429                               ? pstr->len : pstr->bufs_len;
 430                     byte_idx += mbcdlen;
 431                     src_idx += mbclen;
 432                     continue;
 433                   }
 434                 else
 435                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 436               }
 437             else
 438               memcpy (pstr->mbs + byte_idx, p, mbclen);
 439
 440             if (BE (pstr->offsets_needed != 0, 0))
 441               {
 442                 size_t i;
 443                 for (i = 0; i < mbclen; ++i)
 444                   pstr->offsets[byte_idx + i] = src_idx + i;
 445               }
 446             src_idx += mbclen;
 447
 448             pstr->wcs[byte_idx++] = wcu;
 449             /* Write paddings.  */
 450             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 451               pstr->wcs[byte_idx++] = WEOF;
 452           }
 453         else if (mbclen == (size_t) -1 || mbclen == 0
 454                  || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 455           {
 456             /* It is an invalid character or '\0'.  Just use the byte.  */
 457             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 458
 459             if (BE (pstr->trans != NULL, 0))
 460               ch = pstr->trans [ch];
 461             pstr->mbs[byte_idx] = ch;
 462
 463             if (BE (pstr->offsets_needed != 0, 0))
 464               pstr->offsets[byte_idx] = src_idx;
 465             ++src_idx;
 466
 467             /* And also cast it to wide char.  */
 468             pstr->wcs[byte_idx++] = (wchar_t) ch;
 469             if (BE (mbclen == (size_t) -1, 0))
 470               pstr->cur_state = prev_st;
 471           }
 472         else
 473           {
 474             /* The buffer doesn't have enough space, finish to build.  */
 475             pstr->cur_state = prev_st;
 476             break;
 477           }
 478       }
 479   pstr->valid_len = byte_idx;
 480   pstr->valid_raw_len = src_idx;
 481   return REG_NOERROR;
 482 }
 483
 484 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 485    Return the index.  */
 486
 487 static Idx
 488 internal_function
 489 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 490 {
 491   mbstate_t prev_st;
 492   Idx rawbuf_idx;
 493   size_t mbclen;
 494   wint_t wc = WEOF;
 495
 496   /* Skip the characters which are not necessary to check.  */
 497   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 498        rawbuf_idx < new_raw_idx;)
 499     {
 500       wchar_t wc2;
 501       Idx remain_len = pstr->raw_len - rawbuf_idx;
 502       prev_st = pstr->cur_state;
 503       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 504                           remain_len, &pstr->cur_state);
 505       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 506         {
 507           /* We treat these cases as a single byte character.  */
 508           if (mbclen == 0 || remain_len == 0)
 509             wc = L'\0';
 510           else
 511             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 512           mbclen = 1;
 513           pstr->cur_state = prev_st;
 514         }
 515       else
 516         wc = wc2;
 517       /* Then proceed the next character.  */
 518       rawbuf_idx += mbclen;
 519     }
 520   *last_wc = wc;
 521   return rawbuf_idx;
 522 }
 523 #endif /* RE_ENABLE_I18N  */
 524
 525 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 526    This function is used in case of REG_ICASE.  */
 527
 528 static void
 529 internal_function
 530 build_upper_buffer (re_string_t *pstr)
 531 {
 532   Idx char_idx, end_idx;
 533   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 534
 535   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 536     {
 537       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 538       if (BE (pstr->trans != NULL, 0))
 539         ch = pstr->trans[ch];
 540       if (islower (ch))
 541         pstr->mbs[char_idx] = toupper (ch);
 542       else
 543         pstr->mbs[char_idx] = ch;
 544     }
 545   pstr->valid_len = char_idx;
 546   pstr->valid_raw_len = char_idx;
 547 }
 548
 549 /* Apply TRANS to the buffer in PSTR.  */
 550
 551 static void
 552 internal_function
 553 re_string_translate_buffer (re_string_t *pstr)
 554 {
 555   Idx buf_idx, end_idx;
 556   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 557
 558   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 559     {
 560       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 561       pstr->mbs[buf_idx] = pstr->trans[ch];
 562     }
 563
 564   pstr->valid_len = buf_idx;
 565   pstr->valid_raw_len = buf_idx;
 566 }
 567
 568 /* This function re-construct the buffers.
 569    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 570    convert to upper case in case of REG_ICASE, apply translation.  */
 571
 572 static reg_errcode_t
 573 internal_function __attribute_warn_unused_result__
 574 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 575 {
 576   Idx offset;
 577
 578   if (BE (pstr->raw_mbs_idx <= idx, 0))
 579     offset = idx - pstr->raw_mbs_idx;
 580   else
 581     {
 582       /* Reset buffer.  */
 583 #ifdef RE_ENABLE_I18N
 584       if (pstr->mb_cur_max > 1)
 585         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 586 #endif /* RE_ENABLE_I18N */
 587       pstr->len = pstr->raw_len;
 588       pstr->stop = pstr->raw_stop;
 589       pstr->valid_len = 0;
 590       pstr->raw_mbs_idx = 0;
 591       pstr->valid_raw_len = 0;
 592       pstr->offsets_needed = 0;
 593       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 594                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 595       if (!pstr->mbs_allocated)
 596         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 597       offset = idx;
 598     }
 599
 600   if (BE (offset != 0, 1))
 601     {
 602       /* Should the already checked characters be kept?  */
 603       if (BE (offset < pstr->valid_raw_len, 1))
 604         {
 605           /* Yes, move them to the front of the buffer.  */
 606 #ifdef RE_ENABLE_I18N
 607           if (BE (pstr->offsets_needed, 0))
 608             {
 609               Idx low = 0, high = pstr->valid_len, mid;
 610               do
 611                 {
 612                   mid = (high + low) / 2;
 613                   if (pstr->offsets[mid] > offset)
 614                     high = mid;
 615                   else if (pstr->offsets[mid] < offset)
 616                     low = mid + 1;
 617                   else
 618                     break;
 619                 }
 620               while (low < high);
 621               if (pstr->offsets[mid] < offset)
 622                 ++mid;
 623               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 624                                                         eflags);
 625               /* This can be quite complicated, so handle specially
 626                  only the common and easy case where the character with
 627                  different length representation of lower and upper
 628                  case is present at or after offset.  */
 629               if (pstr->valid_len > offset
 630                   && mid == offset && pstr->offsets[mid] == offset)
 631                 {
 632                   memmove (pstr->wcs, pstr->wcs + offset,
 633                            (pstr->valid_len - offset) * sizeof (wint_t));
 634                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 635                   pstr->valid_len -= offset;
 636                   pstr->valid_raw_len -= offset;
 637                   for (low = 0; low < pstr->valid_len; low++)
 638                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 639                 }
 640               else
 641                 {
 642                   /* Otherwise, just find out how long the partial multibyte
 643                      character at offset is and fill it with WEOF/255.  */
 644                   pstr->len = pstr->raw_len - idx + offset;
 645                   pstr->stop = pstr->raw_stop - idx + offset;
 646                   pstr->offsets_needed = 0;
 647                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 648                     --mid;
 649                   while (mid < pstr->valid_len)
 650                     if (pstr->wcs[mid] != WEOF)
 651                       break;
 652                     else
 653                       ++mid;
 654                   if (mid == pstr->valid_len)
 655                     pstr->valid_len = 0;
 656                   else
 657                     {
 658                       pstr->valid_len = pstr->offsets[mid] - offset;
 659                       if (pstr->valid_len)
 660                         {
 661                           for (low = 0; low < pstr->valid_len; ++low)
 662                             pstr->wcs[low] = WEOF;
 663                           memset (pstr->mbs, 255, pstr->valid_len);
 664                         }
 665                     }
 666                   pstr->valid_raw_len = pstr->valid_len;
 667                 }
 668             }
 669           else
 670 #endif
 671             {
 672               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 673                                                         eflags);
 674 #ifdef RE_ENABLE_I18N
 675               if (pstr->mb_cur_max > 1)
 676                 memmove (pstr->wcs, pstr->wcs + offset,
 677                          (pstr->valid_len - offset) * sizeof (wint_t));
 678 #endif /* RE_ENABLE_I18N */
 679               if (BE (pstr->mbs_allocated, 0))
 680                 memmove (pstr->mbs, pstr->mbs + offset,
 681                          pstr->valid_len - offset);
 682               pstr->valid_len -= offset;
 683               pstr->valid_raw_len -= offset;
 684 #if DEBUG
 685               assert (pstr->valid_len > 0);
 686 #endif
 687             }
 688         }
 689       else
 690         {
 691 #ifdef RE_ENABLE_I18N
 692           /* No, skip all characters until IDX.  */
 693           Idx prev_valid_len = pstr->valid_len;
 694
 695           if (BE (pstr->offsets_needed, 0))
 696             {
 697               pstr->len = pstr->raw_len - idx + offset;
 698               pstr->stop = pstr->raw_stop - idx + offset;
 699               pstr->offsets_needed = 0;
 700             }
 701 #endif
 702           pstr->valid_len = 0;
 703 #ifdef RE_ENABLE_I18N
 704           if (pstr->mb_cur_max > 1)
 705             {
 706               Idx wcs_idx;
 707               wint_t wc = WEOF;
 708
 709               if (pstr->is_utf8)
 710                 {
 711                   const unsigned char *raw, *p, *end;
 712
 713                   /* Special case UTF-8.  Multi-byte chars start with any
 714                      byte other than 0x80 - 0xbf.  */
 715                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 716                   end = raw + (offset - pstr->mb_cur_max);
 717                   if (end < pstr->raw_mbs)
 718                     end = pstr->raw_mbs;
 719                   p = raw + offset - 1;
 720 #ifdef _LIBC
 721                   /* We know the wchar_t encoding is UCS4, so for the simple
 722                      case, ASCII characters, skip the conversion step.  */
 723                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 724                     {
 725                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 726                       /* pstr->valid_len = 0; */
 727                       wc = (wchar_t) *p;
 728                     }
 729                   else
 730 #endif
 731                     for (; p >= end; --p)
 732                       if ((*p & 0xc0) != 0x80)
 733                         {
 734                           mbstate_t cur_state;
 735                           wchar_t wc2;
 736                           Idx mlen = raw + pstr->len - p;
 737                           unsigned char buf[6];
 738                           size_t mbclen;
 739
 740                           const unsigned char *pp = p;
 741                           if (BE (pstr->trans != NULL, 0))
 742                             {
 743                               int i = mlen < 6 ? mlen : 6;
 744                               while (--i >= 0)
 745                                 buf[i] = pstr->trans[p[i]];
 746                               pp = buf;
 747                             }
 748                           /* XXX Don't use mbrtowc, we know which conversion
 749                              to use (UTF-8 -> UCS4).  */
 750                           memset (&cur_state, 0, sizeof (cur_state));
 751                           mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
 752                                               &cur_state);
 753                           if (raw + offset - p <= mbclen
 754                               && mbclen < (size_t) -2)
 755                             {
 756                               memset (&pstr->cur_state, '\0',
 757                                       sizeof (mbstate_t));
 758                               pstr->valid_len = mbclen - (raw + offset - p);
 759                               wc = wc2;
 760                             }
 761                           break;
 762                         }
 763                 }
 764
 765               if (wc == WEOF)
 766                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 767               if (wc == WEOF)
 768                 pstr->tip_context
 769                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 770               else
 771                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 772                                       && IS_WIDE_WORD_CHAR (wc))
 773                                      ? CONTEXT_WORD
 774                                      : ((IS_WIDE_NEWLINE (wc)
 775                                          && pstr->newline_anchor)
 776                                         ? CONTEXT_NEWLINE : 0));
 777               if (BE (pstr->valid_len, 0))
 778                 {
 779                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 780                     pstr->wcs[wcs_idx] = WEOF;
 781                   if (pstr->mbs_allocated)
 782                     memset (pstr->mbs, 255, pstr->valid_len);
 783                 }
 784               pstr->valid_raw_len = pstr->valid_len;
 785             }
 786           else
 787 #endif /* RE_ENABLE_I18N */
 788             {
 789               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 790               pstr->valid_raw_len = 0;
 791               if (pstr->trans)
 792                 c = pstr->trans[c];
 793               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 794                                    ? CONTEXT_WORD
 795                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 796                                       ? CONTEXT_NEWLINE : 0));
 797             }
 798         }
 799       if (!BE (pstr->mbs_allocated, 0))
 800         pstr->mbs += offset;
 801     }
 802   pstr->raw_mbs_idx = idx;
 803   pstr->len -= offset;
 804   pstr->stop -= offset;
 805
 806   /* Then build the buffers.  */
 807 #ifdef RE_ENABLE_I18N
 808   if (pstr->mb_cur_max > 1)
 809     {
 810       if (pstr->icase)
 811         {
 812           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 813           if (BE (ret != REG_NOERROR, 0))
 814             return ret;
 815         }
 816       else
 817         build_wcs_buffer (pstr);
 818     }
 819   else
 820 #endif /* RE_ENABLE_I18N */
 821     if (BE (pstr->mbs_allocated, 0))
 822       {
 823         if (pstr->icase)
 824           build_upper_buffer (pstr);
 825         else if (pstr->trans != NULL)
 826           re_string_translate_buffer (pstr);
 827       }
 828     else
 829       pstr->valid_len = pstr->len;
 830
 831   pstr->cur_idx = 0;
 832   return REG_NOERROR;
 833 }
 834
 835 static unsigned char
 836 internal_function __attribute ((pure))
 837 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 838 {
 839   int ch;
 840   Idx off;
 841
 842   /* Handle the common (easiest) cases first.  */
 843   if (BE (!pstr->mbs_allocated, 1))
 844     return re_string_peek_byte (pstr, idx);
 845
 846 #ifdef RE_ENABLE_I18N
 847   if (pstr->mb_cur_max > 1
 848       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 849     return re_string_peek_byte (pstr, idx);
 850 #endif
 851
 852   off = pstr->cur_idx + idx;
 853 #ifdef RE_ENABLE_I18N
 854   if (pstr->offsets_needed)
 855     off = pstr->offsets[off];
 856 #endif
 857
 858   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 859
 860 #ifdef RE_ENABLE_I18N
 861   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 862      this function returns CAPITAL LETTER I instead of first byte of
 863      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 864      since peek_byte_case doesn't advance cur_idx in any way.  */
 865   if (pstr->offsets_needed && !isascii (ch))
 866     return re_string_peek_byte (pstr, idx);
 867 #endif
 868
 869   return ch;
 870 }
 871
 872 static unsigned char
 873 internal_function
 874 re_string_fetch_byte_case (re_string_t *pstr)
 875 {
 876   if (BE (!pstr->mbs_allocated, 1))
 877     return re_string_fetch_byte (pstr);
 878
 879 #ifdef RE_ENABLE_I18N
 880   if (pstr->offsets_needed)
 881     {
 882       Idx off;
 883       int ch;
 884
 885       /* For tr_TR.UTF-8 [[:islower:]] there is
 886          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 887          in that case the whole multi-byte character and return
 888          the original letter.  On the other side, with
 889          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 890          anything else would complicate things too much.  */
 891
 892       if (!re_string_first_byte (pstr, pstr->cur_idx))
 893         return re_string_fetch_byte (pstr);
 894
 895       off = pstr->offsets[pstr->cur_idx];
 896       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 897
 898       if (! isascii (ch))
 899         return re_string_fetch_byte (pstr);
 900
 901       re_string_skip_bytes (pstr,
 902                             re_string_char_size_at (pstr, pstr->cur_idx));
 903       return ch;
 904     }
 905 #endif
 906
 907   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 908 }
 909
 910 static void
 911 internal_function
 912 re_string_destruct (re_string_t *pstr)
 913 {
 914 #ifdef RE_ENABLE_I18N
 915   re_free (pstr->wcs);
 916   re_free (pstr->offsets);
 917 #endif /* RE_ENABLE_I18N  */
 918   if (pstr->mbs_allocated)
 919     re_free (pstr->mbs);
 920 }
 921
 922 /* Return the context at IDX in INPUT.  */
 923
 924 static unsigned int
 925 internal_function
 926 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 927 {
 928   int c;
 929   if (BE (! REG_VALID_INDEX (idx), 0))
 930     /* In this case, we use the value stored in input->tip_context,
 931        since we can't know the character in input->mbs[-1] here.  */
 932     return input->tip_context;
 933   if (BE (idx == input->len, 0))
 934     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 935             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 936 #ifdef RE_ENABLE_I18N
 937   if (input->mb_cur_max > 1)
 938     {
 939       wint_t wc;
 940       Idx wc_idx = idx;
 941       while(input->wcs[wc_idx] == WEOF)
 942         {
 943 #ifdef DEBUG
 944           /* It must not happen.  */
 945           assert (REG_VALID_INDEX (wc_idx));
 946 #endif
 947           --wc_idx;
 948           if (! REG_VALID_INDEX (wc_idx))
 949             return input->tip_context;
 950         }
 951       wc = input->wcs[wc_idx];
 952       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 953         return CONTEXT_WORD;
 954       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 955               ? CONTEXT_NEWLINE : 0);
 956     }
 957   else
 958 #endif
 959     {
 960       c = re_string_byte_at (input, idx);
 961       if (bitset_contain (input->word_char, c))
 962         return CONTEXT_WORD;
 963       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 964     }
 965 }
 966 \f
 967 /* Functions for set operation.  */
 968
 969 static reg_errcode_t
 970 internal_function __attribute_warn_unused_result__
 971 re_node_set_alloc (re_node_set *set, Idx size)
 972 {
 973   set->alloc = size;
 974   set->nelem = 0;
 975   set->elems = re_malloc (Idx, size);
 976   if (BE (set->elems == NULL, 0))
 977     return REG_ESPACE;
 978   return REG_NOERROR;
 979 }
 980
 981 static reg_errcode_t
 982 internal_function __attribute_warn_unused_result__
 983 re_node_set_init_1 (re_node_set *set, Idx elem)
 984 {
 985   set->alloc = 1;
 986   set->nelem = 1;
 987   set->elems = re_malloc (Idx, 1);
 988   if (BE (set->elems == NULL, 0))
 989     {
 990       set->alloc = set->nelem = 0;
 991       return REG_ESPACE;
 992     }
 993   set->elems[0] = elem;
 994   return REG_NOERROR;
 995 }
 996
 997 static reg_errcode_t
 998 internal_function __attribute_warn_unused_result__
 999 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1000 {
1001   set->alloc = 2;
1002   set->elems = re_malloc (Idx, 2);
1003   if (BE (set->elems == NULL, 0))
1004     return REG_ESPACE;
1005   if (elem1 == elem2)
1006     {
1007       set->nelem = 1;
1008       set->elems[0] = elem1;
1009     }
1010   else
1011     {
1012       set->nelem = 2;
1013       if (elem1 < elem2)
1014         {
1015           set->elems[0] = elem1;
1016           set->elems[1] = elem2;
1017         }
1018       else
1019         {
1020           set->elems[0] = elem2;
1021           set->elems[1] = elem1;
1022         }
1023     }
1024   return REG_NOERROR;
1025 }
1026
1027 static reg_errcode_t
1028 internal_function __attribute_warn_unused_result__
1029 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1030 {
1031   dest->nelem = src->nelem;
1032   if (src->nelem > 0)
1033     {
1034       dest->alloc = dest->nelem;
1035       dest->elems = re_malloc (Idx, dest->alloc);
1036       if (BE (dest->elems == NULL, 0))
1037         {
1038           dest->alloc = dest->nelem = 0;
1039           return REG_ESPACE;
1040         }
1041       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1042     }
1043   else
1044     re_node_set_init_empty (dest);
1045   return REG_NOERROR;
1046 }
1047
1048 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1049    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1050    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1051
1052 static reg_errcode_t
1053 internal_function __attribute_warn_unused_result__
1054 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1055                            const re_node_set *src2)
1056 {
1057   Idx i1, i2, is, id, delta, sbase;
1058   if (src1->nelem == 0 || src2->nelem == 0)
1059     return REG_NOERROR;
1060
1061   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1062      conservative estimate.  */
1063   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1064     {
1065       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1066       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1067       if (BE (new_elems == NULL, 0))
1068         return REG_ESPACE;
1069       dest->elems = new_elems;
1070       dest->alloc = new_alloc;
1071     }
1072
1073   /* Find the items in the intersection of SRC1 and SRC2, and copy
1074      into the top of DEST those that are not already in DEST itself.  */
1075   sbase = dest->nelem + src1->nelem + src2->nelem;
1076   i1 = src1->nelem - 1;
1077   i2 = src2->nelem - 1;
1078   id = dest->nelem - 1;
1079   for (;;)
1080     {
1081       if (src1->elems[i1] == src2->elems[i2])
1082         {
1083           /* Try to find the item in DEST.  Maybe we could binary search?  */
1084           while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1085             --id;
1086
1087           if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1088             dest->elems[--sbase] = src1->elems[i1];
1089
1090           if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1091             break;
1092         }
1093
1094       /* Lower the highest of the two items.  */
1095       else if (src1->elems[i1] < src2->elems[i2])
1096         {
1097           if (! REG_VALID_INDEX (--i2))
1098             break;
1099         }
1100       else
1101         {
1102           if (! REG_VALID_INDEX (--i1))
1103             break;
1104         }
1105     }
1106
1107   id = dest->nelem - 1;
1108   is = dest->nelem + src1->nelem + src2->nelem - 1;
1109   delta = is - sbase + 1;
1110
1111   /* Now copy.  When DELTA becomes zero, the remaining
1112      DEST elements are already in place; this is more or
1113      less the same loop that is in re_node_set_merge.  */
1114   dest->nelem += delta;
1115   if (delta > 0 && REG_VALID_INDEX (id))
1116     for (;;)
1117       {
1118         if (dest->elems[is] > dest->elems[id])
1119           {
1120             /* Copy from the top.  */
1121             dest->elems[id + delta--] = dest->elems[is--];
1122             if (delta == 0)
1123               break;
1124           }
1125         else
1126           {
1127             /* Slide from the bottom.  */
1128             dest->elems[id + delta] = dest->elems[id];
1129             if (! REG_VALID_INDEX (--id))
1130               break;
1131           }
1132       }
1133
1134   /* Copy remaining SRC elements.  */
1135   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1136
1137   return REG_NOERROR;
1138 }
1139
1140 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1141    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1142
1143 static reg_errcode_t
1144 internal_function __attribute_warn_unused_result__
1145 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1146                         const re_node_set *src2)
1147 {
1148   Idx i1, i2, id;
1149   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1150     {
1151       dest->alloc = src1->nelem + src2->nelem;
1152       dest->elems = re_malloc (Idx, dest->alloc);
1153       if (BE (dest->elems == NULL, 0))
1154         return REG_ESPACE;
1155     }
1156   else
1157     {
1158       if (src1 != NULL && src1->nelem > 0)
1159         return re_node_set_init_copy (dest, src1);
1160       else if (src2 != NULL && src2->nelem > 0)
1161         return re_node_set_init_copy (dest, src2);
1162       else
1163         re_node_set_init_empty (dest);
1164       return REG_NOERROR;
1165     }
1166   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1167     {
1168       if (src1->elems[i1] > src2->elems[i2])
1169         {
1170           dest->elems[id++] = src2->elems[i2++];
1171           continue;
1172         }
1173       if (src1->elems[i1] == src2->elems[i2])
1174         ++i2;
1175       dest->elems[id++] = src1->elems[i1++];
1176     }
1177   if (i1 < src1->nelem)
1178     {
1179       memcpy (dest->elems + id, src1->elems + i1,
1180              (src1->nelem - i1) * sizeof (Idx));
1181       id += src1->nelem - i1;
1182     }
1183   else if (i2 < src2->nelem)
1184     {
1185       memcpy (dest->elems + id, src2->elems + i2,
1186              (src2->nelem - i2) * sizeof (Idx));
1187       id += src2->nelem - i2;
1188     }
1189   dest->nelem = id;
1190   return REG_NOERROR;
1191 }
1192
1193 /* Calculate the union set of the sets DEST and SRC. And store it to
1194    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1195
1196 static reg_errcode_t
1197 internal_function __attribute_warn_unused_result__
1198 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1199 {
1200   Idx is, id, sbase, delta;
1201   if (src == NULL || src->nelem == 0)
1202     return REG_NOERROR;
1203   if (dest->alloc < 2 * src->nelem + dest->nelem)
1204     {
1205       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1206       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1207       if (BE (new_buffer == NULL, 0))
1208         return REG_ESPACE;
1209       dest->elems = new_buffer;
1210       dest->alloc = new_alloc;
1211     }
1212
1213   if (BE (dest->nelem == 0, 0))
1214     {
1215       dest->nelem = src->nelem;
1216       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1217       return REG_NOERROR;
1218     }
1219
1220   /* Copy into the top of DEST the items of SRC that are not
1221      found in DEST.  Maybe we could binary search in DEST?  */
1222   for (sbase = dest->nelem + 2 * src->nelem,
1223        is = src->nelem - 1, id = dest->nelem - 1;
1224        REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1225     {
1226       if (dest->elems[id] == src->elems[is])
1227         is--, id--;
1228       else if (dest->elems[id] < src->elems[is])
1229         dest->elems[--sbase] = src->elems[is--];
1230       else /* if (dest->elems[id] > src->elems[is]) */
1231         --id;
1232     }
1233
1234   if (REG_VALID_INDEX (is))
1235     {
1236       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1237       sbase -= is + 1;
1238       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1239     }
1240
1241   id = dest->nelem - 1;
1242   is = dest->nelem + 2 * src->nelem - 1;
1243   delta = is - sbase + 1;
1244   if (delta == 0)
1245     return REG_NOERROR;
1246
1247   /* Now copy.  When DELTA becomes zero, the remaining
1248      DEST elements are already in place.  */
1249   dest->nelem += delta;
1250   for (;;)
1251     {
1252       if (dest->elems[is] > dest->elems[id])
1253         {
1254           /* Copy from the top.  */
1255           dest->elems[id + delta--] = dest->elems[is--];
1256           if (delta == 0)
1257             break;
1258         }
1259       else
1260         {
1261           /* Slide from the bottom.  */
1262           dest->elems[id + delta] = dest->elems[id];
1263           if (! REG_VALID_INDEX (--id))
1264             {
1265               /* Copy remaining SRC elements.  */
1266               memcpy (dest->elems, dest->elems + sbase,
1267                       delta * sizeof (Idx));
1268               break;
1269             }
1270         }
1271     }
1272
1273   return REG_NOERROR;
1274 }
1275
1276 /* Insert the new element ELEM to the re_node_set* SET.
1277    SET should not already have ELEM.
1278    Return true if successful.  */
1279
1280 static bool
1281 internal_function __attribute_warn_unused_result__
1282 re_node_set_insert (re_node_set *set, Idx elem)
1283 {
1284   Idx idx;
1285   /* In case the set is empty.  */
1286   if (set->alloc == 0)
1287     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1288
1289   if (BE (set->nelem, 0) == 0)
1290     {
1291       /* We already guaranteed above that set->alloc != 0.  */
1292       set->elems[0] = elem;
1293       ++set->nelem;
1294       return true;
1295     }
1296
1297   /* Realloc if we need.  */
1298   if (set->alloc == set->nelem)
1299     {
1300       Idx *new_elems;
1301       set->alloc = set->alloc * 2;
1302       new_elems = re_realloc (set->elems, Idx, set->alloc);
1303       if (BE (new_elems == NULL, 0))
1304         return false;
1305       set->elems = new_elems;
1306     }
1307
1308   /* Move the elements which follows the new element.  Test the
1309      first element separately to skip a check in the inner loop.  */
1310   if (elem < set->elems[0])
1311     {
1312       idx = 0;
1313       for (idx = set->nelem; idx > 0; idx--)
1314         set->elems[idx] = set->elems[idx - 1];
1315     }
1316   else
1317     {
1318       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1319         set->elems[idx] = set->elems[idx - 1];
1320     }
1321
1322   /* Insert the new element.  */
1323   set->elems[idx] = elem;
1324   ++set->nelem;
1325   return true;
1326 }
1327
1328 /* Insert the new element ELEM to the re_node_set* SET.
1329    SET should not already have any element greater than or equal to ELEM.
1330    Return true if successful.  */
1331
1332 static bool
1333 internal_function __attribute_warn_unused_result__
1334 re_node_set_insert_last (re_node_set *set, Idx elem)
1335 {
1336   /* Realloc if we need.  */
1337   if (set->alloc == set->nelem)
1338     {
1339       Idx *new_elems;
1340       set->alloc = (set->alloc + 1) * 2;
1341       new_elems = re_realloc (set->elems, Idx, set->alloc);
1342       if (BE (new_elems == NULL, 0))
1343         return false;
1344       set->elems = new_elems;
1345     }
1346
1347   /* Insert the new element.  */
1348   set->elems[set->nelem++] = elem;
1349   return true;
1350 }
1351
1352 /* Compare two node sets SET1 and SET2.
1353    Return true if SET1 and SET2 are equivalent.  */
1354
1355 static bool
1356 internal_function __attribute ((pure))
1357 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1358 {
1359   Idx i;
1360   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1361     return false;
1362   for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1363     if (set1->elems[i] != set2->elems[i])
1364       return false;
1365   return true;
1366 }
1367
1368 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1369
1370 static Idx
1371 internal_function __attribute ((pure))
1372 re_node_set_contains (const re_node_set *set, Idx elem)
1373 {
1374   __re_size_t idx, right, mid;
1375   if (! REG_VALID_NONZERO_INDEX (set->nelem))
1376     return 0;
1377
1378   /* Binary search the element.  */
1379   idx = 0;
1380   right = set->nelem - 1;
1381   while (idx < right)
1382     {
1383       mid = (idx + right) / 2;
1384       if (set->elems[mid] < elem)
1385         idx = mid + 1;
1386       else
1387         right = mid;
1388     }
1389   return set->elems[idx] == elem ? idx + 1 : 0;
1390 }
1391
1392 static void
1393 internal_function
1394 re_node_set_remove_at (re_node_set *set, Idx idx)
1395 {
1396   if (idx < 0 || idx >= set->nelem)
1397     return;
1398   --set->nelem;
1399   for (; idx < set->nelem; idx++)
1400     set->elems[idx] = set->elems[idx + 1];
1401 }
1402 \f
1403
1404 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1405    Or return REG_MISSING if an error occurred.  */
1406
1407 static Idx
1408 internal_function
1409 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1410 {
1411   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1412     {
1413       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1414       Idx *new_nexts, *new_indices;
1415       re_node_set *new_edests, *new_eclosures;
1416       re_token_t *new_nodes;
1417
1418       /* Avoid overflows in realloc.  */
1419       const size_t max_object_size = MAX (sizeof (re_token_t),
1420                                           MAX (sizeof (re_node_set),
1421                                                sizeof (Idx)));
1422       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1423         return REG_MISSING;
1424
1425       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1426       if (BE (new_nodes == NULL, 0))
1427         return REG_MISSING;
1428       dfa->nodes = new_nodes;
1429       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1430       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1431       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1432       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1433       if (BE (new_nexts == NULL || new_indices == NULL
1434               || new_edests == NULL || new_eclosures == NULL, 0))
1435         return REG_MISSING;
1436       dfa->nexts = new_nexts;
1437       dfa->org_indices = new_indices;
1438       dfa->edests = new_edests;
1439       dfa->eclosures = new_eclosures;
1440       dfa->nodes_alloc = new_nodes_alloc;
1441     }
1442   dfa->nodes[dfa->nodes_len] = token;
1443   dfa->nodes[dfa->nodes_len].constraint = 0;
1444 #ifdef RE_ENABLE_I18N
1445   {
1446   int type = token.type;
1447   dfa->nodes[dfa->nodes_len].accept_mb =
1448     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1449   }
1450 #endif
1451   dfa->nexts[dfa->nodes_len] = REG_MISSING;
1452   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1453   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1454   return dfa->nodes_len++;
1455 }
1456
1457 static inline re_hashval_t
1458 internal_function
1459 calc_state_hash (const re_node_set *nodes, unsigned int context)
1460 {
1461   re_hashval_t hash = nodes->nelem + context;
1462   Idx i;
1463   for (i = 0 ; i < nodes->nelem ; i++)
1464     hash += nodes->elems[i];
1465   return hash;
1466 }
1467
1468 /* Search for the state whose node_set is equivalent to NODES.
1469    Return the pointer to the state, if we found it in the DFA.
1470    Otherwise create the new one and return it.  In case of an error
1471    return NULL and set the error code in ERR.
1472    Note: - We assume NULL as the invalid state, then it is possible that
1473            return value is NULL and ERR is REG_NOERROR.
1474          - We never return non-NULL value in case of any errors, it is for
1475            optimization.  */
1476
1477 static re_dfastate_t *
1478 internal_function __attribute_warn_unused_result__
1479 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1480                   const re_node_set *nodes)
1481 {
1482   re_hashval_t hash;
1483   re_dfastate_t *new_state;
1484   struct re_state_table_entry *spot;
1485   Idx i;
1486 #ifdef lint
1487   /* Suppress bogus uninitialized-variable warnings.  */
1488   *err = REG_NOERROR;
1489 #endif
1490   if (BE (nodes->nelem == 0, 0))
1491     {
1492       *err = REG_NOERROR;
1493       return NULL;
1494     }
1495   hash = calc_state_hash (nodes, 0);
1496   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1497
1498   for (i = 0 ; i < spot->num ; i++)
1499     {
1500       re_dfastate_t *state = spot->array[i];
1501       if (hash != state->hash)
1502         continue;
1503       if (re_node_set_compare (&state->nodes, nodes))
1504         return state;
1505     }
1506
1507   /* There are no appropriate state in the dfa, create the new one.  */
1508   new_state = create_ci_newstate (dfa, nodes, hash);
1509   if (BE (new_state == NULL, 0))
1510     *err = REG_ESPACE;
1511
1512   return new_state;
1513 }
1514
1515 /* Search for the state whose node_set is equivalent to NODES and
1516    whose context is equivalent to CONTEXT.
1517    Return the pointer to the state, if we found it in the DFA.
1518    Otherwise create the new one and return it.  In case of an error
1519    return NULL and set the error code in ERR.
1520    Note: - We assume NULL as the invalid state, then it is possible that
1521            return value is NULL and ERR is REG_NOERROR.
1522          - We never return non-NULL value in case of any errors, it is for
1523            optimization.  */
1524
1525 static re_dfastate_t *
1526 internal_function __attribute_warn_unused_result__
1527 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1528                           const re_node_set *nodes, unsigned int context)
1529 {
1530   re_hashval_t hash;
1531   re_dfastate_t *new_state;
1532   struct re_state_table_entry *spot;
1533   Idx i;
1534 #ifdef lint
1535   /* Suppress bogus uninitialized-variable warnings.  */
1536   *err = REG_NOERROR;
1537 #endif
1538   if (nodes->nelem == 0)
1539     {
1540       *err = REG_NOERROR;
1541       return NULL;
1542     }
1543   hash = calc_state_hash (nodes, context);
1544   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1545
1546   for (i = 0 ; i < spot->num ; i++)
1547     {
1548       re_dfastate_t *state = spot->array[i];
1549       if (state->hash == hash
1550           && state->context == context
1551           && re_node_set_compare (state->entrance_nodes, nodes))
1552         return state;
1553     }
1554   /* There are no appropriate state in 'dfa', create the new one.  */
1555   new_state = create_cd_newstate (dfa, nodes, context, hash);
1556   if (BE (new_state == NULL, 0))
1557     *err = REG_ESPACE;
1558
1559   return new_state;
1560 }
1561
1562 /* Finish initialization of the new state NEWSTATE, and using its hash value
1563    HASH put in the appropriate bucket of DFA's state table.  Return value
1564    indicates the error code if failed.  */
1565
1566 static reg_errcode_t
1567 __attribute_warn_unused_result__
1568 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1569                 re_hashval_t hash)
1570 {
1571   struct re_state_table_entry *spot;
1572   reg_errcode_t err;
1573   Idx i;
1574
1575   newstate->hash = hash;
1576   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1577   if (BE (err != REG_NOERROR, 0))
1578     return REG_ESPACE;
1579   for (i = 0; i < newstate->nodes.nelem; i++)
1580     {
1581       Idx elem = newstate->nodes.elems[i];
1582       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1583         if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1584           return REG_ESPACE;
1585     }
1586
1587   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1588   if (BE (spot->alloc <= spot->num, 0))
1589     {
1590       Idx new_alloc = 2 * spot->num + 2;
1591       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1592                                               new_alloc);
1593       if (BE (new_array == NULL, 0))
1594         return REG_ESPACE;
1595       spot->array = new_array;
1596       spot->alloc = new_alloc;
1597     }
1598   spot->array[spot->num++] = newstate;
1599   return REG_NOERROR;
1600 }
1601
1602 static void
1603 free_state (re_dfastate_t *state)
1604 {
1605   re_node_set_free (&state->non_eps_nodes);
1606   re_node_set_free (&state->inveclosure);
1607   if (state->entrance_nodes != &state->nodes)
1608     {
1609       re_node_set_free (state->entrance_nodes);
1610       re_free (state->entrance_nodes);
1611     }
1612   re_node_set_free (&state->nodes);
1613   re_free (state->word_trtable);
1614   re_free (state->trtable);
1615   re_free (state);
1616 }
1617
1618 /* Create the new state which is independent of contexts.
1619    Return the new state if succeeded, otherwise return NULL.  */
1620
1621 static re_dfastate_t *
1622 internal_function __attribute_warn_unused_result__
1623 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1624                     re_hashval_t hash)
1625 {
1626   Idx i;
1627   reg_errcode_t err;
1628   re_dfastate_t *newstate;
1629
1630   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1631   if (BE (newstate == NULL, 0))
1632     return NULL;
1633   err = re_node_set_init_copy (&newstate->nodes, nodes);
1634   if (BE (err != REG_NOERROR, 0))
1635     {
1636       re_free (newstate);
1637       return NULL;
1638     }
1639
1640   newstate->entrance_nodes = &newstate->nodes;
1641   for (i = 0 ; i < nodes->nelem ; i++)
1642     {
1643       re_token_t *node = dfa->nodes + nodes->elems[i];
1644       re_token_type_t type = node->type;
1645       if (type == CHARACTER && !node->constraint)
1646         continue;
1647 #ifdef RE_ENABLE_I18N
1648       newstate->accept_mb |= node->accept_mb;
1649 #endif /* RE_ENABLE_I18N */
1650
1651       /* If the state has the halt node, the state is a halt state.  */
1652       if (type == END_OF_RE)
1653         newstate->halt = 1;
1654       else if (type == OP_BACK_REF)
1655         newstate->has_backref = 1;
1656       else if (type == ANCHOR || node->constraint)
1657         newstate->has_constraint = 1;
1658     }
1659   err = register_state (dfa, newstate, hash);
1660   if (BE (err != REG_NOERROR, 0))
1661     {
1662       free_state (newstate);
1663       newstate = NULL;
1664     }
1665   return newstate;
1666 }
1667
1668 /* Create the new state which is depend on the context CONTEXT.
1669    Return the new state if succeeded, otherwise return NULL.  */
1670
1671 static re_dfastate_t *
1672 internal_function __attribute_warn_unused_result__
1673 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1674                     unsigned int context, re_hashval_t hash)
1675 {
1676   Idx i, nctx_nodes = 0;
1677   reg_errcode_t err;
1678   re_dfastate_t *newstate;
1679
1680   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1681   if (BE (newstate == NULL, 0))
1682     return NULL;
1683   err = re_node_set_init_copy (&newstate->nodes, nodes);
1684   if (BE (err != REG_NOERROR, 0))
1685     {
1686       re_free (newstate);
1687       return NULL;
1688     }
1689
1690   newstate->context = context;
1691   newstate->entrance_nodes = &newstate->nodes;
1692
1693   for (i = 0 ; i < nodes->nelem ; i++)
1694     {
1695       re_token_t *node = dfa->nodes + nodes->elems[i];
1696       re_token_type_t type = node->type;
1697       unsigned int constraint = node->constraint;
1698
1699       if (type == CHARACTER && !constraint)
1700         continue;
1701 #ifdef RE_ENABLE_I18N
1702       newstate->accept_mb |= node->accept_mb;
1703 #endif /* RE_ENABLE_I18N */
1704
1705       /* If the state has the halt node, the state is a halt state.  */
1706       if (type == END_OF_RE)
1707         newstate->halt = 1;
1708       else if (type == OP_BACK_REF)
1709         newstate->has_backref = 1;
1710
1711       if (constraint)
1712         {
1713           if (newstate->entrance_nodes == &newstate->nodes)
1714             {
1715               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1716               if (BE (newstate->entrance_nodes == NULL, 0))
1717                 {
1718                   free_state (newstate);
1719                   return NULL;
1720                 }
1721               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1722                   != REG_NOERROR)
1723                 return NULL;
1724               nctx_nodes = 0;
1725               newstate->has_constraint = 1;
1726             }
1727
1728           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1729             {
1730               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1731               ++nctx_nodes;
1732             }
1733         }
1734     }
1735   err = register_state (dfa, newstate, hash);
1736   if (BE (err != REG_NOERROR, 0))
1737     {
1738       free_state (newstate);
1739       newstate = NULL;
1740     }
1741   return  newstate;
1742 }