Merge remote-tracking branch 'local-2.0/stable-2.0'
[bpt/guile.git] / lib / regcomp.c
CommitLineData
eb4a14ed
LC
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License along
17 with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
24 char *fastmap);
25static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26#ifdef RE_ENABLE_I18N
27static void free_charset (re_charset_t *cset);
28#endif /* RE_ENABLE_I18N */
29static void free_workarea_compile (regex_t *preg);
30static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31#ifdef RE_ENABLE_I18N
32static void optimize_utf8 (re_dfa_t *dfa);
33#endif
34static reg_errcode_t analyze (regex_t *preg);
35static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 Idx node, bool root);
54static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55static Idx fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 bool accept_hyphen);
85static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88#ifdef RE_ENABLE_I18N
89static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 re_charset_t *mbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94 bitset_t sbcset,
95 re_charset_t *mbcset,
96 Idx *char_class_alloc,
97 const unsigned char *class_name,
98 reg_syntax_t syntax);
99#else /* not RE_ENABLE_I18N */
100static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 bitset_t sbcset,
104 const unsigned char *class_name,
105 reg_syntax_t syntax);
106#endif /* not RE_ENABLE_I18N */
107static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const unsigned char *class_name,
110 const unsigned char *extra,
111 bool non_match, reg_errcode_t *err);
112static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119static void free_token (re_token_t *node);
120static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122\f
123/* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
127
128static const char __re_error_msgid[] =
129 {
130#define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
132 "\0"
133#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
135 "\0"
136#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 "\0"
139#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 "\0"
142#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 "\0"
145#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 "\0"
148#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 "\0"
151#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 "\0"
154#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 "\0"
157#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 "\0"
160#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 "\0"
163#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 "\0"
166#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 "\0"
169#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 "\0"
172#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 "\0"
175#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 "\0"
178#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180 };
181
182static const size_t __re_error_msgid_idx[] =
183 {
184 REG_NOERROR_IDX,
185 REG_NOMATCH_IDX,
186 REG_BADPAT_IDX,
187 REG_ECOLLATE_IDX,
188 REG_ECTYPE_IDX,
189 REG_EESCAPE_IDX,
190 REG_ESUBREG_IDX,
191 REG_EBRACK_IDX,
192 REG_EPAREN_IDX,
193 REG_EBRACE_IDX,
194 REG_BADBR_IDX,
195 REG_ERANGE_IDX,
196 REG_ESPACE_IDX,
197 REG_BADRPT_IDX,
198 REG_EEND_IDX,
199 REG_ESIZE_IDX,
200 REG_ERPAREN_IDX
201 };
202\f
203/* Entry points for GNU code. */
204
205/* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
208
209 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
210 are set in BUFP on entry. */
211
212#ifdef _LIBC
213const char *
214re_compile_pattern (pattern, length, bufp)
215 const char *pattern;
216 size_t length;
217 struct re_pattern_buffer *bufp;
218#else /* size_t might promote */
219const char *
220re_compile_pattern (const char *pattern, size_t length,
221 struct re_pattern_buffer *bufp)
222#endif
223{
224 reg_errcode_t ret;
225
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
230
231 /* Match anchors at newline. */
232 bufp->newline_anchor = 1;
233
234 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
235
236 if (!ret)
237 return NULL;
238 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
239}
240#ifdef _LIBC
241weak_alias (__re_compile_pattern, re_compile_pattern)
242#endif
243
244/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247/* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249reg_syntax_t re_syntax_options;
250
251
252/* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
255
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
258
259reg_syntax_t
260re_set_syntax (syntax)
261 reg_syntax_t syntax;
262{
263 reg_syntax_t ret = re_syntax_options;
264
265 re_syntax_options = syntax;
266 return ret;
267}
268#ifdef _LIBC
269weak_alias (__re_set_syntax, re_set_syntax)
270#endif
271
272int
273re_compile_fastmap (bufp)
274 struct re_pattern_buffer *bufp;
275{
276 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
277 char *fastmap = bufp->fastmap;
278
279 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281 if (dfa->init_state != dfa->init_state_word)
282 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283 if (dfa->init_state != dfa->init_state_nl)
284 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285 if (dfa->init_state != dfa->init_state_begbuf)
286 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287 bufp->fastmap_accurate = 1;
288 return 0;
289}
290#ifdef _LIBC
291weak_alias (__re_compile_fastmap, re_compile_fastmap)
292#endif
293
294static inline void
295__attribute ((always_inline))
296re_set_fastmap (char *fastmap, bool icase, int ch)
297{
298 fastmap[ch] = 1;
299 if (icase)
300 fastmap[tolower (ch)] = 1;
301}
302
303/* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
305
306static void
307re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
308 char *fastmap)
309{
310 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
311 Idx node_cnt;
312 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314 {
315 Idx node = init_state->nodes.elems[node_cnt];
316 re_token_type_t type = dfa->nodes[node].type;
317
318 if (type == CHARACTER)
319 {
320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321#ifdef RE_ENABLE_I18N
322 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323 {
324 unsigned char buf[MB_LEN_MAX];
325 unsigned char *p;
326 wchar_t wc;
327 mbstate_t state;
328
329 p = buf;
330 *p++ = dfa->nodes[node].opr.c;
331 while (++node < dfa->nodes_len
332 && dfa->nodes[node].type == CHARACTER
333 && dfa->nodes[node].mb_partial)
334 *p++ = dfa->nodes[node].opr.c;
335 memset (&state, '\0', sizeof (state));
336 if (__mbrtowc (&wc, (const char *) buf, p - buf,
337 &state) == p - buf
338 && (__wcrtomb ((char *) buf, towlower (wc), &state)
339 != (size_t) -1))
340 re_set_fastmap (fastmap, false, buf[0]);
341 }
342#endif
343 }
344 else if (type == SIMPLE_BRACKET)
345 {
346 int i, ch;
347 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
348 {
349 int j;
350 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 if (w & ((bitset_word_t) 1 << j))
353 re_set_fastmap (fastmap, icase, ch);
354 }
355 }
356#ifdef RE_ENABLE_I18N
357 else if (type == COMPLEX_BRACKET)
358 {
359 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
360 Idx i;
361
362# ifdef _LIBC
363 /* See if we have to try all bytes which start multiple collation
364 elements.
365 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
366 collation element, and don't catch 'b' since 'b' is
367 the only collation element which starts from 'b' (and
368 it is caught by SIMPLE_BRACKET). */
369 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
370 && (cset->ncoll_syms || cset->nranges))
371 {
372 const int32_t *table = (const int32_t *)
373 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
374 for (i = 0; i < SBC_MAX; ++i)
375 if (table[i] < 0)
376 re_set_fastmap (fastmap, icase, i);
377 }
378# endif /* _LIBC */
379
380 /* See if we have to start the match at all multibyte characters,
381 i.e. where we would not find an invalid sequence. This only
382 applies to multibyte character sets; for single byte character
383 sets, the SIMPLE_BRACKET again suffices. */
384 if (dfa->mb_cur_max > 1
385 && (cset->nchar_classes || cset->non_match || cset->nranges
386# ifdef _LIBC
387 || cset->nequiv_classes
388# endif /* _LIBC */
389 ))
390 {
391 unsigned char c = 0;
392 do
393 {
394 mbstate_t mbs;
395 memset (&mbs, 0, sizeof (mbs));
396 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
397 re_set_fastmap (fastmap, false, (int) c);
398 }
399 while (++c != 0);
400 }
401
402 else
403 {
404 /* ... Else catch all bytes which can start the mbchars. */
405 for (i = 0; i < cset->nmbchars; ++i)
406 {
407 char buf[256];
408 mbstate_t state;
409 memset (&state, '\0', sizeof (state));
410 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
411 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
412 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
413 {
414 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
415 != (size_t) -1)
416 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
417 }
418 }
419 }
420 }
421#endif /* RE_ENABLE_I18N */
422 else if (type == OP_PERIOD
423#ifdef RE_ENABLE_I18N
424 || type == OP_UTF8_PERIOD
425#endif /* RE_ENABLE_I18N */
426 || type == END_OF_RE)
427 {
428 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429 if (type == END_OF_RE)
430 bufp->can_be_null = 1;
431 return;
432 }
433 }
434}
435\f
436/* Entry point for POSIX code. */
437/* regcomp takes a regular expression as a string and compiles it.
438
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
441
442 'buffer' to the compiled pattern;
443 'used' to the length of the compiled pattern;
444 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 'fastmap' to an allocated space for the fastmap;
449 'fastmap_accurate' to zero;
450 're_nsub' to the number of subexpressions in PATTERN.
451
452 PATTERN is the address of the pattern string.
453
454 CFLAGS is a series of bits which affect compilation.
455
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
458
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
461
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
464
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
467 registers.
468
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
471
472int
473regcomp (preg, pattern, cflags)
474 regex_t *_Restrict_ preg;
475 const char *_Restrict_ pattern;
476 int cflags;
477{
478 reg_errcode_t ret;
479 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC);
481
482 preg->buffer = NULL;
483 preg->allocated = 0;
484 preg->used = 0;
485
486 /* Try to allocate space for the fastmap. */
487 preg->fastmap = re_malloc (char, SBC_MAX);
488 if (BE (preg->fastmap == NULL, 0))
489 return REG_ESPACE;
490
491 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
492
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags & REG_NEWLINE)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax &= ~RE_DOT_NEWLINE;
497 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498 /* It also changes the matching behavior. */
499 preg->newline_anchor = 1;
500 }
501 else
502 preg->newline_anchor = 0;
503 preg->no_sub = !!(cflags & REG_NOSUB);
504 preg->translate = NULL;
505
506 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
507
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret == REG_ERPAREN)
511 ret = REG_EPAREN;
512
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret == REG_NOERROR, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg);
518 else
519 {
520 /* Some error occurred while compiling the expression. */
521 re_free (preg->fastmap);
522 preg->fastmap = NULL;
523 }
524
525 return (int) ret;
526}
527#ifdef _LIBC
528weak_alias (__regcomp, regcomp)
529#endif
530
531/* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
533
534#ifdef _LIBC
535size_t
536regerror (errcode, preg, errbuf, errbuf_size)
537 int errcode;
538 const regex_t *_Restrict_ preg;
539 char *_Restrict_ errbuf;
540 size_t errbuf_size;
541#else /* size_t might promote */
542size_t
543regerror (int errcode, const regex_t *_Restrict_ preg,
544 char *_Restrict_ errbuf, size_t errbuf_size)
545#endif
546{
547 const char *msg;
548 size_t msg_size;
549
550 if (BE (errcode < 0
551 || errcode >= (int) (sizeof (__re_error_msgid_idx)
552 / sizeof (__re_error_msgid_idx[0])), 0))
553 /* Only error codes returned by the rest of the code should be passed
554 to this routine. If we are given anything else, or if other regex
555 code generates an invalid error code, then the program has a bug.
556 Dump core so we can fix it. */
557 abort ();
558
559 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
560
561 msg_size = strlen (msg) + 1; /* Includes the null. */
562
563 if (BE (errbuf_size != 0, 1))
564 {
565 size_t cpy_size = msg_size;
566 if (BE (msg_size > errbuf_size, 0))
567 {
568 cpy_size = errbuf_size - 1;
569 errbuf[cpy_size] = '\0';
570 }
571 memcpy (errbuf, msg, cpy_size);
572 }
573
574 return msg_size;
575}
576#ifdef _LIBC
577weak_alias (__regerror, regerror)
578#endif
579
580
581#ifdef RE_ENABLE_I18N
582/* This static array is used for the map to single-byte characters when
583 UTF-8 is used. Otherwise we would allocate memory just to initialize
584 it the same all the time. UTF-8 is the preferred encoding so this is
585 a worthwhile optimization. */
586static const bitset_t utf8_sb_map =
587{
588 /* Set the first 128 bits. */
589# if 4 * BITSET_WORD_BITS < ASCII_CHARS
590# error "bitset_word_t is narrower than 32 bits"
591# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
592 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
593# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
594 BITSET_WORD_MAX, BITSET_WORD_MAX,
595# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
596 BITSET_WORD_MAX,
597# endif
598 (BITSET_WORD_MAX
599 >> (SBC_MAX % BITSET_WORD_BITS == 0
600 ? 0
601 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
602};
603#endif
604
605
606static void
607free_dfa_content (re_dfa_t *dfa)
608{
609 Idx i, j;
610
611 if (dfa->nodes)
612 for (i = 0; i < dfa->nodes_len; ++i)
613 free_token (dfa->nodes + i);
614 re_free (dfa->nexts);
615 for (i = 0; i < dfa->nodes_len; ++i)
616 {
617 if (dfa->eclosures != NULL)
618 re_node_set_free (dfa->eclosures + i);
619 if (dfa->inveclosures != NULL)
620 re_node_set_free (dfa->inveclosures + i);
621 if (dfa->edests != NULL)
622 re_node_set_free (dfa->edests + i);
623 }
624 re_free (dfa->edests);
625 re_free (dfa->eclosures);
626 re_free (dfa->inveclosures);
627 re_free (dfa->nodes);
628
629 if (dfa->state_table)
630 for (i = 0; i <= dfa->state_hash_mask; ++i)
631 {
632 struct re_state_table_entry *entry = dfa->state_table + i;
633 for (j = 0; j < entry->num; ++j)
634 {
635 re_dfastate_t *state = entry->array[j];
636 free_state (state);
637 }
638 re_free (entry->array);
639 }
640 re_free (dfa->state_table);
641#ifdef RE_ENABLE_I18N
642 if (dfa->sb_char != utf8_sb_map)
643 re_free (dfa->sb_char);
644#endif
645 re_free (dfa->subexp_map);
646#ifdef DEBUG
647 re_free (dfa->re_str);
648#endif
649
650 re_free (dfa);
651}
652
653
654/* Free dynamically allocated space used by PREG. */
655
656void
657regfree (preg)
658 regex_t *preg;
659{
660 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
661 if (BE (dfa != NULL, 1))
662 free_dfa_content (dfa);
663 preg->buffer = NULL;
664 preg->allocated = 0;
665
666 re_free (preg->fastmap);
667 preg->fastmap = NULL;
668
669 re_free (preg->translate);
670 preg->translate = NULL;
671}
672#ifdef _LIBC
673weak_alias (__regfree, regfree)
674#endif
675\f
676/* Entry points compatible with 4.2 BSD regex library. We don't define
677 them unless specifically requested. */
678
679#if defined _REGEX_RE_COMP || defined _LIBC
680
681/* BSD has one and only one pattern buffer. */
682static struct re_pattern_buffer re_comp_buf;
683
684char *
685# ifdef _LIBC
686/* Make these definitions weak in libc, so POSIX programs can redefine
687 these names if they don't use our functions, and still use
688 regcomp/regexec above without link errors. */
689weak_function
690# endif
691re_comp (s)
692 const char *s;
693{
694 reg_errcode_t ret;
695 char *fastmap;
696
697 if (!s)
698 {
699 if (!re_comp_buf.buffer)
700 return gettext ("No previous regular expression");
701 return 0;
702 }
703
704 if (re_comp_buf.buffer)
705 {
706 fastmap = re_comp_buf.fastmap;
707 re_comp_buf.fastmap = NULL;
708 __regfree (&re_comp_buf);
709 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
710 re_comp_buf.fastmap = fastmap;
711 }
712
713 if (re_comp_buf.fastmap == NULL)
714 {
715 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
716 if (re_comp_buf.fastmap == NULL)
717 return (char *) gettext (__re_error_msgid
718 + __re_error_msgid_idx[(int) REG_ESPACE]);
719 }
720
721 /* Since 're_exec' always passes NULL for the 'regs' argument, we
722 don't need to initialize the pattern buffer fields which affect it. */
723
724 /* Match anchors at newlines. */
725 re_comp_buf.newline_anchor = 1;
726
727 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
728
729 if (!ret)
730 return NULL;
731
732 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
733 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
734}
735
736#ifdef _LIBC
737libc_freeres_fn (free_mem)
738{
739 __regfree (&re_comp_buf);
740}
741#endif
742
743#endif /* _REGEX_RE_COMP */
744\f
745/* Internal entry point.
746 Compile the regular expression PATTERN, whose length is LENGTH.
747 SYNTAX indicate regular expression's syntax. */
748
749static reg_errcode_t
750re_compile_internal (regex_t *preg, const char * pattern, size_t length,
751 reg_syntax_t syntax)
752{
753 reg_errcode_t err = REG_NOERROR;
754 re_dfa_t *dfa;
755 re_string_t regexp;
756
757 /* Initialize the pattern buffer. */
758 preg->fastmap_accurate = 0;
759 preg->syntax = syntax;
760 preg->not_bol = preg->not_eol = 0;
761 preg->used = 0;
762 preg->re_nsub = 0;
763 preg->can_be_null = 0;
764 preg->regs_allocated = REGS_UNALLOCATED;
765
766 /* Initialize the dfa. */
767 dfa = (re_dfa_t *) preg->buffer;
768 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
769 {
770 /* If zero allocated, but buffer is non-null, try to realloc
771 enough space. This loses if buffer's address is bogus, but
772 that is the user's responsibility. If ->buffer is NULL this
773 is a simple allocation. */
774 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
775 if (dfa == NULL)
776 return REG_ESPACE;
777 preg->allocated = sizeof (re_dfa_t);
778 preg->buffer = (unsigned char *) dfa;
779 }
780 preg->used = sizeof (re_dfa_t);
781
782 err = init_dfa (dfa, length);
783 if (BE (err != REG_NOERROR, 0))
784 {
785 free_dfa_content (dfa);
786 preg->buffer = NULL;
787 preg->allocated = 0;
788 return err;
789 }
790#ifdef DEBUG
791 /* Note: length+1 will not overflow since it is checked in init_dfa. */
792 dfa->re_str = re_malloc (char, length + 1);
793 strncpy (dfa->re_str, pattern, length + 1);
794#endif
795
796 __libc_lock_init (dfa->lock);
797
798 err = re_string_construct (&regexp, pattern, length, preg->translate,
799 (syntax & RE_ICASE) != 0, dfa);
800 if (BE (err != REG_NOERROR, 0))
801 {
802 re_compile_internal_free_return:
803 free_workarea_compile (preg);
804 re_string_destruct (&regexp);
805 free_dfa_content (dfa);
806 preg->buffer = NULL;
807 preg->allocated = 0;
808 return err;
809 }
810
811 /* Parse the regular expression, and build a structure tree. */
812 preg->re_nsub = 0;
813 dfa->str_tree = parse (&regexp, preg, syntax, &err);
814 if (BE (dfa->str_tree == NULL, 0))
815 goto re_compile_internal_free_return;
816
817 /* Analyze the tree and create the nfa. */
818 err = analyze (preg);
819 if (BE (err != REG_NOERROR, 0))
820 goto re_compile_internal_free_return;
821
822#ifdef RE_ENABLE_I18N
823 /* If possible, do searching in single byte encoding to speed things up. */
824 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
825 optimize_utf8 (dfa);
826#endif
827
828 /* Then create the initial state of the dfa. */
829 err = create_initial_state (dfa);
830
831 /* Release work areas. */
832 free_workarea_compile (preg);
833 re_string_destruct (&regexp);
834
835 if (BE (err != REG_NOERROR, 0))
836 {
837 free_dfa_content (dfa);
838 preg->buffer = NULL;
839 preg->allocated = 0;
840 }
841
842 return err;
843}
844
845/* Initialize DFA. We use the length of the regular expression PAT_LEN
846 as the initial length of some arrays. */
847
848static reg_errcode_t
849init_dfa (re_dfa_t *dfa, size_t pat_len)
850{
851 __re_size_t table_size;
852#ifndef _LIBC
853 char *codeset_name;
854#endif
855#ifdef RE_ENABLE_I18N
856 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
857#else
858 size_t max_i18n_object_size = 0;
859#endif
860 size_t max_object_size =
861 MAX (sizeof (struct re_state_table_entry),
862 MAX (sizeof (re_token_t),
863 MAX (sizeof (re_node_set),
864 MAX (sizeof (regmatch_t),
865 max_i18n_object_size))));
866
867 memset (dfa, '\0', sizeof (re_dfa_t));
868
869 /* Force allocation of str_tree_storage the first time. */
870 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
871
872 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
873 calculation below, and for similar doubling calculations
874 elsewhere. And it's <= rather than <, because some of the
875 doubling calculations add 1 afterwards. */
876 if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
877 return REG_ESPACE;
878
879 dfa->nodes_alloc = pat_len + 1;
880 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
881
882 /* table_size = 2 ^ ceil(log pat_len) */
883 for (table_size = 1; ; table_size <<= 1)
884 if (table_size > pat_len)
885 break;
886
887 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
888 dfa->state_hash_mask = table_size - 1;
889
890 dfa->mb_cur_max = MB_CUR_MAX;
891#ifdef _LIBC
892 if (dfa->mb_cur_max == 6
893 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
894 dfa->is_utf8 = 1;
895 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
896 != 0);
897#else
898 codeset_name = nl_langinfo (CODESET);
899 if (strcasecmp (codeset_name, "UTF-8") == 0
900 || strcasecmp (codeset_name, "UTF8") == 0)
901 dfa->is_utf8 = 1;
902
903 /* We check exhaustively in the loop below if this charset is a
904 superset of ASCII. */
905 dfa->map_notascii = 0;
906#endif
907
908#ifdef RE_ENABLE_I18N
909 if (dfa->mb_cur_max > 1)
910 {
911 if (dfa->is_utf8)
912 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
913 else
914 {
915 int i, j, ch;
916
917 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
918 if (BE (dfa->sb_char == NULL, 0))
919 return REG_ESPACE;
920
921 /* Set the bits corresponding to single byte chars. */
922 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
923 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
924 {
925 wint_t wch = __btowc (ch);
926 if (wch != WEOF)
927 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
928# ifndef _LIBC
929 if (isascii (ch) && wch != ch)
930 dfa->map_notascii = 1;
931# endif
932 }
933 }
934 }
935#endif
936
937 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
938 return REG_ESPACE;
939 return REG_NOERROR;
940}
941
942/* Initialize WORD_CHAR table, which indicate which character is
943 "word". In this case "word" means that it is the word construction
944 character used by some operators like "\<", "\>", etc. */
945
946static void
947internal_function
948init_word_char (re_dfa_t *dfa)
949{
950 int i, j, ch;
951 dfa->word_ops_used = 1;
952 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
953 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
954 if (isalnum (ch) || ch == '_')
955 dfa->word_char[i] |= (bitset_word_t) 1 << j;
956}
957
958/* Free the work area which are only used while compiling. */
959
960static void
961free_workarea_compile (regex_t *preg)
962{
963 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
964 bin_tree_storage_t *storage, *next;
965 for (storage = dfa->str_tree_storage; storage; storage = next)
966 {
967 next = storage->next;
968 re_free (storage);
969 }
970 dfa->str_tree_storage = NULL;
971 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
972 dfa->str_tree = NULL;
973 re_free (dfa->org_indices);
974 dfa->org_indices = NULL;
975}
976
977/* Create initial states for all contexts. */
978
979static reg_errcode_t
980create_initial_state (re_dfa_t *dfa)
981{
982 Idx first, i;
983 reg_errcode_t err;
984 re_node_set init_nodes;
985
986 /* Initial states have the epsilon closure of the node which is
987 the first node of the regular expression. */
988 first = dfa->str_tree->first->node_idx;
989 dfa->init_node = first;
990 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
991 if (BE (err != REG_NOERROR, 0))
992 return err;
993
994 /* The back-references which are in initial states can epsilon transit,
995 since in this case all of the subexpressions can be null.
996 Then we add epsilon closures of the nodes which are the next nodes of
997 the back-references. */
998 if (dfa->nbackref > 0)
999 for (i = 0; i < init_nodes.nelem; ++i)
1000 {
1001 Idx node_idx = init_nodes.elems[i];
1002 re_token_type_t type = dfa->nodes[node_idx].type;
1003
1004 Idx clexp_idx;
1005 if (type != OP_BACK_REF)
1006 continue;
1007 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1008 {
1009 re_token_t *clexp_node;
1010 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1011 if (clexp_node->type == OP_CLOSE_SUBEXP
1012 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1013 break;
1014 }
1015 if (clexp_idx == init_nodes.nelem)
1016 continue;
1017
1018 if (type == OP_BACK_REF)
1019 {
1020 Idx dest_idx = dfa->edests[node_idx].elems[0];
1021 if (!re_node_set_contains (&init_nodes, dest_idx))
1022 {
1023 reg_errcode_t merge_err
1024 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1025 if (merge_err != REG_NOERROR)
1026 return merge_err;
1027 i = 0;
1028 }
1029 }
1030 }
1031
1032 /* It must be the first time to invoke acquire_state. */
1033 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1034 /* We don't check ERR here, since the initial state must not be NULL. */
1035 if (BE (dfa->init_state == NULL, 0))
1036 return err;
1037 if (dfa->init_state->has_constraint)
1038 {
1039 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1040 CONTEXT_WORD);
1041 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1042 CONTEXT_NEWLINE);
1043 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1044 &init_nodes,
1045 CONTEXT_NEWLINE
1046 | CONTEXT_BEGBUF);
1047 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1048 || dfa->init_state_begbuf == NULL, 0))
1049 return err;
1050 }
1051 else
1052 dfa->init_state_word = dfa->init_state_nl
1053 = dfa->init_state_begbuf = dfa->init_state;
1054
1055 re_node_set_free (&init_nodes);
1056 return REG_NOERROR;
1057}
1058\f
1059#ifdef RE_ENABLE_I18N
1060/* If it is possible to do searching in single byte encoding instead of UTF-8
1061 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1062 DFA nodes where needed. */
1063
1064static void
1065optimize_utf8 (re_dfa_t *dfa)
1066{
1067 Idx node;
1068 int i;
1069 bool mb_chars = false;
1070 bool has_period = false;
1071
1072 for (node = 0; node < dfa->nodes_len; ++node)
1073 switch (dfa->nodes[node].type)
1074 {
1075 case CHARACTER:
1076 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1077 mb_chars = true;
1078 break;
1079 case ANCHOR:
1080 switch (dfa->nodes[node].opr.ctx_type)
1081 {
1082 case LINE_FIRST:
1083 case LINE_LAST:
1084 case BUF_FIRST:
1085 case BUF_LAST:
1086 break;
1087 default:
1088 /* Word anchors etc. cannot be handled. It's okay to test
1089 opr.ctx_type since constraints (for all DFA nodes) are
1090 created by ORing one or more opr.ctx_type values. */
1091 return;
1092 }
1093 break;
1094 case OP_PERIOD:
1095 has_period = true;
1096 break;
1097 case OP_BACK_REF:
1098 case OP_ALT:
1099 case END_OF_RE:
1100 case OP_DUP_ASTERISK:
1101 case OP_OPEN_SUBEXP:
1102 case OP_CLOSE_SUBEXP:
1103 break;
1104 case COMPLEX_BRACKET:
1105 return;
1106 case SIMPLE_BRACKET:
1107 /* Just double check. */
1108 {
1109 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1110 ? 0
1111 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1112 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1113 {
1114 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1115 return;
1116 rshift = 0;
1117 }
1118 }
1119 break;
1120 default:
1121 abort ();
1122 }
1123
1124 if (mb_chars || has_period)
1125 for (node = 0; node < dfa->nodes_len; ++node)
1126 {
1127 if (dfa->nodes[node].type == CHARACTER
1128 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1129 dfa->nodes[node].mb_partial = 0;
1130 else if (dfa->nodes[node].type == OP_PERIOD)
1131 dfa->nodes[node].type = OP_UTF8_PERIOD;
1132 }
1133
1134 /* The search can be in single byte locale. */
1135 dfa->mb_cur_max = 1;
1136 dfa->is_utf8 = 0;
1137 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1138}
1139#endif
1140\f
1141/* Analyze the structure tree, and calculate "first", "next", "edest",
1142 "eclosure", and "inveclosure". */
1143
1144static reg_errcode_t
1145analyze (regex_t *preg)
1146{
1147 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1148 reg_errcode_t ret;
1149
1150 /* Allocate arrays. */
1151 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1152 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1153 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1154 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1155 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1156 || dfa->eclosures == NULL, 0))
1157 return REG_ESPACE;
1158
1159 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1160 if (dfa->subexp_map != NULL)
1161 {
1162 Idx i;
1163 for (i = 0; i < preg->re_nsub; i++)
1164 dfa->subexp_map[i] = i;
1165 preorder (dfa->str_tree, optimize_subexps, dfa);
1166 for (i = 0; i < preg->re_nsub; i++)
1167 if (dfa->subexp_map[i] != i)
1168 break;
1169 if (i == preg->re_nsub)
1170 {
1171 free (dfa->subexp_map);
1172 dfa->subexp_map = NULL;
1173 }
1174 }
1175
1176 ret = postorder (dfa->str_tree, lower_subexps, preg);
1177 if (BE (ret != REG_NOERROR, 0))
1178 return ret;
1179 ret = postorder (dfa->str_tree, calc_first, dfa);
1180 if (BE (ret != REG_NOERROR, 0))
1181 return ret;
1182 preorder (dfa->str_tree, calc_next, dfa);
1183 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1184 if (BE (ret != REG_NOERROR, 0))
1185 return ret;
1186 ret = calc_eclosure (dfa);
1187 if (BE (ret != REG_NOERROR, 0))
1188 return ret;
1189
1190 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1191 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1192 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1193 || dfa->nbackref)
1194 {
1195 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1196 if (BE (dfa->inveclosures == NULL, 0))
1197 return REG_ESPACE;
1198 ret = calc_inveclosure (dfa);
1199 }
1200
1201 return ret;
1202}
1203
1204/* Our parse trees are very unbalanced, so we cannot use a stack to
1205 implement parse tree visits. Instead, we use parent pointers and
1206 some hairy code in these two functions. */
1207static reg_errcode_t
1208postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1209 void *extra)
1210{
1211 bin_tree_t *node, *prev;
1212
1213 for (node = root; ; )
1214 {
1215 /* Descend down the tree, preferably to the left (or to the right
1216 if that's the only child). */
1217 while (node->left || node->right)
1218 if (node->left)
1219 node = node->left;
1220 else
1221 node = node->right;
1222
1223 do
1224 {
1225 reg_errcode_t err = fn (extra, node);
1226 if (BE (err != REG_NOERROR, 0))
1227 return err;
1228 if (node->parent == NULL)
1229 return REG_NOERROR;
1230 prev = node;
1231 node = node->parent;
1232 }
1233 /* Go up while we have a node that is reached from the right. */
1234 while (node->right == prev || node->right == NULL);
1235 node = node->right;
1236 }
1237}
1238
1239static reg_errcode_t
1240preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1241 void *extra)
1242{
1243 bin_tree_t *node;
1244
1245 for (node = root; ; )
1246 {
1247 reg_errcode_t err = fn (extra, node);
1248 if (BE (err != REG_NOERROR, 0))
1249 return err;
1250
1251 /* Go to the left node, or up and to the right. */
1252 if (node->left)
1253 node = node->left;
1254 else
1255 {
1256 bin_tree_t *prev = NULL;
1257 while (node->right == prev || node->right == NULL)
1258 {
1259 prev = node;
1260 node = node->parent;
1261 if (!node)
1262 return REG_NOERROR;
1263 }
1264 node = node->right;
1265 }
1266 }
1267}
1268
1269/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1270 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1271 backreferences as well. Requires a preorder visit. */
1272static reg_errcode_t
1273optimize_subexps (void *extra, bin_tree_t *node)
1274{
1275 re_dfa_t *dfa = (re_dfa_t *) extra;
1276
1277 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1278 {
1279 int idx = node->token.opr.idx;
1280 node->token.opr.idx = dfa->subexp_map[idx];
1281 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1282 }
1283
1284 else if (node->token.type == SUBEXP
1285 && node->left && node->left->token.type == SUBEXP)
1286 {
1287 Idx other_idx = node->left->token.opr.idx;
1288
1289 node->left = node->left->left;
1290 if (node->left)
1291 node->left->parent = node;
1292
1293 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1294 if (other_idx < BITSET_WORD_BITS)
1295 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1296 }
1297
1298 return REG_NOERROR;
1299}
1300
1301/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1302 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1303static reg_errcode_t
1304lower_subexps (void *extra, bin_tree_t *node)
1305{
1306 regex_t *preg = (regex_t *) extra;
1307 reg_errcode_t err = REG_NOERROR;
1308
1309 if (node->left && node->left->token.type == SUBEXP)
1310 {
1311 node->left = lower_subexp (&err, preg, node->left);
1312 if (node->left)
1313 node->left->parent = node;
1314 }
1315 if (node->right && node->right->token.type == SUBEXP)
1316 {
1317 node->right = lower_subexp (&err, preg, node->right);
1318 if (node->right)
1319 node->right->parent = node;
1320 }
1321
1322 return err;
1323}
1324
1325static bin_tree_t *
1326lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1327{
1328 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1329 bin_tree_t *body = node->left;
1330 bin_tree_t *op, *cls, *tree1, *tree;
1331
1332 if (preg->no_sub
1333 /* We do not optimize empty subexpressions, because otherwise we may
1334 have bad CONCAT nodes with NULL children. This is obviously not
1335 very common, so we do not lose much. An example that triggers
1336 this case is the sed "script" /\(\)/x. */
1337 && node->left != NULL
1338 && (node->token.opr.idx >= BITSET_WORD_BITS
1339 || !(dfa->used_bkref_map
1340 & ((bitset_word_t) 1 << node->token.opr.idx))))
1341 return node->left;
1342
1343 /* Convert the SUBEXP node to the concatenation of an
1344 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1345 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1346 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1347 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1348 tree = create_tree (dfa, op, tree1, CONCAT);
1349 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1350 {
1351 *err = REG_ESPACE;
1352 return NULL;
1353 }
1354
1355 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1356 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1357 return tree;
1358}
1359
1360/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1361 nodes. Requires a postorder visit. */
1362static reg_errcode_t
1363calc_first (void *extra, bin_tree_t *node)
1364{
1365 re_dfa_t *dfa = (re_dfa_t *) extra;
1366 if (node->token.type == CONCAT)
1367 {
1368 node->first = node->left->first;
1369 node->node_idx = node->left->node_idx;
1370 }
1371 else
1372 {
1373 node->first = node;
1374 node->node_idx = re_dfa_add_node (dfa, node->token);
1375 if (BE (node->node_idx == REG_MISSING, 0))
1376 return REG_ESPACE;
1377 if (node->token.type == ANCHOR)
1378 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1379 }
1380 return REG_NOERROR;
1381}
1382
1383/* Pass 2: compute NEXT on the tree. Preorder visit. */
1384static reg_errcode_t
1385calc_next (void *extra, bin_tree_t *node)
1386{
1387 switch (node->token.type)
1388 {
1389 case OP_DUP_ASTERISK:
1390 node->left->next = node;
1391 break;
1392 case CONCAT:
1393 node->left->next = node->right->first;
1394 node->right->next = node->next;
1395 break;
1396 default:
1397 if (node->left)
1398 node->left->next = node->next;
1399 if (node->right)
1400 node->right->next = node->next;
1401 break;
1402 }
1403 return REG_NOERROR;
1404}
1405
1406/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1407static reg_errcode_t
1408link_nfa_nodes (void *extra, bin_tree_t *node)
1409{
1410 re_dfa_t *dfa = (re_dfa_t *) extra;
1411 Idx idx = node->node_idx;
1412 reg_errcode_t err = REG_NOERROR;
1413
1414 switch (node->token.type)
1415 {
1416 case CONCAT:
1417 break;
1418
1419 case END_OF_RE:
1420 assert (node->next == NULL);
1421 break;
1422
1423 case OP_DUP_ASTERISK:
1424 case OP_ALT:
1425 {
1426 Idx left, right;
1427 dfa->has_plural_match = 1;
1428 if (node->left != NULL)
1429 left = node->left->first->node_idx;
1430 else
1431 left = node->next->node_idx;
1432 if (node->right != NULL)
1433 right = node->right->first->node_idx;
1434 else
1435 right = node->next->node_idx;
1436 assert (REG_VALID_INDEX (left));
1437 assert (REG_VALID_INDEX (right));
1438 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1439 }
1440 break;
1441
1442 case ANCHOR:
1443 case OP_OPEN_SUBEXP:
1444 case OP_CLOSE_SUBEXP:
1445 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1446 break;
1447
1448 case OP_BACK_REF:
1449 dfa->nexts[idx] = node->next->node_idx;
1450 if (node->token.type == OP_BACK_REF)
1451 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1452 break;
1453
1454 default:
1455 assert (!IS_EPSILON_NODE (node->token.type));
1456 dfa->nexts[idx] = node->next->node_idx;
1457 break;
1458 }
1459
1460 return err;
1461}
1462
1463/* Duplicate the epsilon closure of the node ROOT_NODE.
1464 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1465 to their own constraint. */
1466
1467static reg_errcode_t
1468internal_function
1469duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1470 Idx root_node, unsigned int init_constraint)
1471{
1472 Idx org_node, clone_node;
1473 bool ok;
1474 unsigned int constraint = init_constraint;
1475 for (org_node = top_org_node, clone_node = top_clone_node;;)
1476 {
1477 Idx org_dest, clone_dest;
1478 if (dfa->nodes[org_node].type == OP_BACK_REF)
1479 {
1480 /* If the back reference epsilon-transit, its destination must
1481 also have the constraint. Then duplicate the epsilon closure
1482 of the destination of the back reference, and store it in
1483 edests of the back reference. */
1484 org_dest = dfa->nexts[org_node];
1485 re_node_set_empty (dfa->edests + clone_node);
1486 clone_dest = duplicate_node (dfa, org_dest, constraint);
1487 if (BE (clone_dest == REG_MISSING, 0))
1488 return REG_ESPACE;
1489 dfa->nexts[clone_node] = dfa->nexts[org_node];
1490 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1491 if (BE (! ok, 0))
1492 return REG_ESPACE;
1493 }
1494 else if (dfa->edests[org_node].nelem == 0)
1495 {
1496 /* In case of the node can't epsilon-transit, don't duplicate the
1497 destination and store the original destination as the
1498 destination of the node. */
1499 dfa->nexts[clone_node] = dfa->nexts[org_node];
1500 break;
1501 }
1502 else if (dfa->edests[org_node].nelem == 1)
1503 {
1504 /* In case of the node can epsilon-transit, and it has only one
1505 destination. */
1506 org_dest = dfa->edests[org_node].elems[0];
1507 re_node_set_empty (dfa->edests + clone_node);
1508 /* If the node is root_node itself, it means the epsilon closure
1509 has a loop. Then tie it to the destination of the root_node. */
1510 if (org_node == root_node && clone_node != org_node)
1511 {
1512 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1513 if (BE (! ok, 0))
1514 return REG_ESPACE;
1515 break;
1516 }
1517 /* In case the node has another constraint, append it. */
1518 constraint |= dfa->nodes[org_node].constraint;
1519 clone_dest = duplicate_node (dfa, org_dest, constraint);
1520 if (BE (clone_dest == REG_MISSING, 0))
1521 return REG_ESPACE;
1522 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1523 if (BE (! ok, 0))
1524 return REG_ESPACE;
1525 }
1526 else /* dfa->edests[org_node].nelem == 2 */
1527 {
1528 /* In case of the node can epsilon-transit, and it has two
1529 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1530 org_dest = dfa->edests[org_node].elems[0];
1531 re_node_set_empty (dfa->edests + clone_node);
1532 /* Search for a duplicated node which satisfies the constraint. */
1533 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1534 if (clone_dest == REG_MISSING)
1535 {
1536 /* There is no such duplicated node, create a new one. */
1537 reg_errcode_t err;
1538 clone_dest = duplicate_node (dfa, org_dest, constraint);
1539 if (BE (clone_dest == REG_MISSING, 0))
1540 return REG_ESPACE;
1541 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1542 if (BE (! ok, 0))
1543 return REG_ESPACE;
1544 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1545 root_node, constraint);
1546 if (BE (err != REG_NOERROR, 0))
1547 return err;
1548 }
1549 else
1550 {
1551 /* There is a duplicated node which satisfies the constraint,
1552 use it to avoid infinite loop. */
1553 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554 if (BE (! ok, 0))
1555 return REG_ESPACE;
1556 }
1557
1558 org_dest = dfa->edests[org_node].elems[1];
1559 clone_dest = duplicate_node (dfa, org_dest, constraint);
1560 if (BE (clone_dest == REG_MISSING, 0))
1561 return REG_ESPACE;
1562 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563 if (BE (! ok, 0))
1564 return REG_ESPACE;
1565 }
1566 org_node = org_dest;
1567 clone_node = clone_dest;
1568 }
1569 return REG_NOERROR;
1570}
1571
1572/* Search for a node which is duplicated from the node ORG_NODE, and
1573 satisfies the constraint CONSTRAINT. */
1574
1575static Idx
1576search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1577 unsigned int constraint)
1578{
1579 Idx idx;
1580 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1581 {
1582 if (org_node == dfa->org_indices[idx]
1583 && constraint == dfa->nodes[idx].constraint)
1584 return idx; /* Found. */
1585 }
1586 return REG_MISSING; /* Not found. */
1587}
1588
1589/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1590 Return the index of the new node, or REG_MISSING if insufficient storage is
1591 available. */
1592
1593static Idx
1594duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1595{
1596 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1597 if (BE (dup_idx != REG_MISSING, 1))
1598 {
1599 dfa->nodes[dup_idx].constraint = constraint;
1600 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1601 dfa->nodes[dup_idx].duplicated = 1;
1602
1603 /* Store the index of the original node. */
1604 dfa->org_indices[dup_idx] = org_idx;
1605 }
1606 return dup_idx;
1607}
1608
1609static reg_errcode_t
1610calc_inveclosure (re_dfa_t *dfa)
1611{
1612 Idx src, idx;
1613 bool ok;
1614 for (idx = 0; idx < dfa->nodes_len; ++idx)
1615 re_node_set_init_empty (dfa->inveclosures + idx);
1616
1617 for (src = 0; src < dfa->nodes_len; ++src)
1618 {
1619 Idx *elems = dfa->eclosures[src].elems;
1620 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1621 {
1622 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1623 if (BE (! ok, 0))
1624 return REG_ESPACE;
1625 }
1626 }
1627
1628 return REG_NOERROR;
1629}
1630
1631/* Calculate "eclosure" for all the node in DFA. */
1632
1633static reg_errcode_t
1634calc_eclosure (re_dfa_t *dfa)
1635{
1636 Idx node_idx;
1637 bool incomplete;
1638#ifdef DEBUG
1639 assert (dfa->nodes_len > 0);
1640#endif
1641 incomplete = false;
1642 /* For each nodes, calculate epsilon closure. */
1643 for (node_idx = 0; ; ++node_idx)
1644 {
1645 reg_errcode_t err;
1646 re_node_set eclosure_elem;
1647 if (node_idx == dfa->nodes_len)
1648 {
1649 if (!incomplete)
1650 break;
1651 incomplete = false;
1652 node_idx = 0;
1653 }
1654
1655#ifdef DEBUG
1656 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1657#endif
1658
1659 /* If we have already calculated, skip it. */
1660 if (dfa->eclosures[node_idx].nelem != 0)
1661 continue;
1662 /* Calculate epsilon closure of 'node_idx'. */
1663 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1664 if (BE (err != REG_NOERROR, 0))
1665 return err;
1666
1667 if (dfa->eclosures[node_idx].nelem == 0)
1668 {
1669 incomplete = true;
1670 re_node_set_free (&eclosure_elem);
1671 }
1672 }
1673 return REG_NOERROR;
1674}
1675
1676/* Calculate epsilon closure of NODE. */
1677
1678static reg_errcode_t
1679calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1680{
1681 reg_errcode_t err;
1682 Idx i;
1683 re_node_set eclosure;
1684 bool ok;
1685 bool incomplete = false;
1686 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1687 if (BE (err != REG_NOERROR, 0))
1688 return err;
1689
1690 /* This indicates that we are calculating this node now.
1691 We reference this value to avoid infinite loop. */
1692 dfa->eclosures[node].nelem = REG_MISSING;
1693
1694 /* If the current node has constraints, duplicate all nodes
1695 since they must inherit the constraints. */
1696 if (dfa->nodes[node].constraint
1697 && dfa->edests[node].nelem
1698 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1699 {
1700 err = duplicate_node_closure (dfa, node, node, node,
1701 dfa->nodes[node].constraint);
1702 if (BE (err != REG_NOERROR, 0))
1703 return err;
1704 }
1705
1706 /* Expand each epsilon destination nodes. */
1707 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1708 for (i = 0; i < dfa->edests[node].nelem; ++i)
1709 {
1710 re_node_set eclosure_elem;
1711 Idx edest = dfa->edests[node].elems[i];
1712 /* If calculating the epsilon closure of 'edest' is in progress,
1713 return intermediate result. */
1714 if (dfa->eclosures[edest].nelem == REG_MISSING)
1715 {
1716 incomplete = true;
1717 continue;
1718 }
1719 /* If we haven't calculated the epsilon closure of 'edest' yet,
1720 calculate now. Otherwise use calculated epsilon closure. */
1721 if (dfa->eclosures[edest].nelem == 0)
1722 {
1723 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1724 if (BE (err != REG_NOERROR, 0))
1725 return err;
1726 }
1727 else
1728 eclosure_elem = dfa->eclosures[edest];
1729 /* Merge the epsilon closure of 'edest'. */
1730 err = re_node_set_merge (&eclosure, &eclosure_elem);
1731 if (BE (err != REG_NOERROR, 0))
1732 return err;
1733 /* If the epsilon closure of 'edest' is incomplete,
1734 the epsilon closure of this node is also incomplete. */
1735 if (dfa->eclosures[edest].nelem == 0)
1736 {
1737 incomplete = true;
1738 re_node_set_free (&eclosure_elem);
1739 }
1740 }
1741
1742 /* An epsilon closure includes itself. */
1743 ok = re_node_set_insert (&eclosure, node);
1744 if (BE (! ok, 0))
1745 return REG_ESPACE;
1746 if (incomplete && !root)
1747 dfa->eclosures[node].nelem = 0;
1748 else
1749 dfa->eclosures[node] = eclosure;
1750 *new_set = eclosure;
1751 return REG_NOERROR;
1752}
1753\f
1754/* Functions for token which are used in the parser. */
1755
1756/* Fetch a token from INPUT.
1757 We must not use this function inside bracket expressions. */
1758
1759static void
1760internal_function
1761fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1762{
1763 re_string_skip_bytes (input, peek_token (result, input, syntax));
1764}
1765
1766/* Peek a token from INPUT, and return the length of the token.
1767 We must not use this function inside bracket expressions. */
1768
1769static int
1770internal_function
1771peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1772{
1773 unsigned char c;
1774
1775 if (re_string_eoi (input))
1776 {
1777 token->type = END_OF_RE;
1778 return 0;
1779 }
1780
1781 c = re_string_peek_byte (input, 0);
1782 token->opr.c = c;
1783
1784 token->word_char = 0;
1785#ifdef RE_ENABLE_I18N
1786 token->mb_partial = 0;
1787 if (input->mb_cur_max > 1 &&
1788 !re_string_first_byte (input, re_string_cur_idx (input)))
1789 {
1790 token->type = CHARACTER;
1791 token->mb_partial = 1;
1792 return 1;
1793 }
1794#endif
1795 if (c == '\\')
1796 {
1797 unsigned char c2;
1798 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1799 {
1800 token->type = BACK_SLASH;
1801 return 1;
1802 }
1803
1804 c2 = re_string_peek_byte_case (input, 1);
1805 token->opr.c = c2;
1806 token->type = CHARACTER;
1807#ifdef RE_ENABLE_I18N
1808 if (input->mb_cur_max > 1)
1809 {
1810 wint_t wc = re_string_wchar_at (input,
1811 re_string_cur_idx (input) + 1);
1812 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1813 }
1814 else
1815#endif
1816 token->word_char = IS_WORD_CHAR (c2) != 0;
1817
1818 switch (c2)
1819 {
1820 case '|':
1821 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1822 token->type = OP_ALT;
1823 break;
1824 case '1': case '2': case '3': case '4': case '5':
1825 case '6': case '7': case '8': case '9':
1826 if (!(syntax & RE_NO_BK_REFS))
1827 {
1828 token->type = OP_BACK_REF;
1829 token->opr.idx = c2 - '1';
1830 }
1831 break;
1832 case '<':
1833 if (!(syntax & RE_NO_GNU_OPS))
1834 {
1835 token->type = ANCHOR;
1836 token->opr.ctx_type = WORD_FIRST;
1837 }
1838 break;
1839 case '>':
1840 if (!(syntax & RE_NO_GNU_OPS))
1841 {
1842 token->type = ANCHOR;
1843 token->opr.ctx_type = WORD_LAST;
1844 }
1845 break;
1846 case 'b':
1847 if (!(syntax & RE_NO_GNU_OPS))
1848 {
1849 token->type = ANCHOR;
1850 token->opr.ctx_type = WORD_DELIM;
1851 }
1852 break;
1853 case 'B':
1854 if (!(syntax & RE_NO_GNU_OPS))
1855 {
1856 token->type = ANCHOR;
1857 token->opr.ctx_type = NOT_WORD_DELIM;
1858 }
1859 break;
1860 case 'w':
1861 if (!(syntax & RE_NO_GNU_OPS))
1862 token->type = OP_WORD;
1863 break;
1864 case 'W':
1865 if (!(syntax & RE_NO_GNU_OPS))
1866 token->type = OP_NOTWORD;
1867 break;
1868 case 's':
1869 if (!(syntax & RE_NO_GNU_OPS))
1870 token->type = OP_SPACE;
1871 break;
1872 case 'S':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 token->type = OP_NOTSPACE;
1875 break;
1876 case '`':
1877 if (!(syntax & RE_NO_GNU_OPS))
1878 {
1879 token->type = ANCHOR;
1880 token->opr.ctx_type = BUF_FIRST;
1881 }
1882 break;
1883 case '\'':
1884 if (!(syntax & RE_NO_GNU_OPS))
1885 {
1886 token->type = ANCHOR;
1887 token->opr.ctx_type = BUF_LAST;
1888 }
1889 break;
1890 case '(':
1891 if (!(syntax & RE_NO_BK_PARENS))
1892 token->type = OP_OPEN_SUBEXP;
1893 break;
1894 case ')':
1895 if (!(syntax & RE_NO_BK_PARENS))
1896 token->type = OP_CLOSE_SUBEXP;
1897 break;
1898 case '+':
1899 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1900 token->type = OP_DUP_PLUS;
1901 break;
1902 case '?':
1903 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1904 token->type = OP_DUP_QUESTION;
1905 break;
1906 case '{':
1907 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1908 token->type = OP_OPEN_DUP_NUM;
1909 break;
1910 case '}':
1911 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1912 token->type = OP_CLOSE_DUP_NUM;
1913 break;
1914 default:
1915 break;
1916 }
1917 return 2;
1918 }
1919
1920 token->type = CHARACTER;
1921#ifdef RE_ENABLE_I18N
1922 if (input->mb_cur_max > 1)
1923 {
1924 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1925 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1926 }
1927 else
1928#endif
1929 token->word_char = IS_WORD_CHAR (token->opr.c);
1930
1931 switch (c)
1932 {
1933 case '\n':
1934 if (syntax & RE_NEWLINE_ALT)
1935 token->type = OP_ALT;
1936 break;
1937 case '|':
1938 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1939 token->type = OP_ALT;
1940 break;
1941 case '*':
1942 token->type = OP_DUP_ASTERISK;
1943 break;
1944 case '+':
1945 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1946 token->type = OP_DUP_PLUS;
1947 break;
1948 case '?':
1949 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1950 token->type = OP_DUP_QUESTION;
1951 break;
1952 case '{':
1953 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1954 token->type = OP_OPEN_DUP_NUM;
1955 break;
1956 case '}':
1957 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1958 token->type = OP_CLOSE_DUP_NUM;
1959 break;
1960 case '(':
1961 if (syntax & RE_NO_BK_PARENS)
1962 token->type = OP_OPEN_SUBEXP;
1963 break;
1964 case ')':
1965 if (syntax & RE_NO_BK_PARENS)
1966 token->type = OP_CLOSE_SUBEXP;
1967 break;
1968 case '[':
1969 token->type = OP_OPEN_BRACKET;
1970 break;
1971 case '.':
1972 token->type = OP_PERIOD;
1973 break;
1974 case '^':
1975 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1976 re_string_cur_idx (input) != 0)
1977 {
1978 char prev = re_string_peek_byte (input, -1);
1979 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1980 break;
1981 }
1982 token->type = ANCHOR;
1983 token->opr.ctx_type = LINE_FIRST;
1984 break;
1985 case '$':
1986 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1987 re_string_cur_idx (input) + 1 != re_string_length (input))
1988 {
1989 re_token_t next;
1990 re_string_skip_bytes (input, 1);
1991 peek_token (&next, input, syntax);
1992 re_string_skip_bytes (input, -1);
1993 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1994 break;
1995 }
1996 token->type = ANCHOR;
1997 token->opr.ctx_type = LINE_LAST;
1998 break;
1999 default:
2000 break;
2001 }
2002 return 1;
2003}
2004
2005/* Peek a token from INPUT, and return the length of the token.
2006 We must not use this function out of bracket expressions. */
2007
2008static int
2009internal_function
2010peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2011{
2012 unsigned char c;
2013 if (re_string_eoi (input))
2014 {
2015 token->type = END_OF_RE;
2016 return 0;
2017 }
2018 c = re_string_peek_byte (input, 0);
2019 token->opr.c = c;
2020
2021#ifdef RE_ENABLE_I18N
2022 if (input->mb_cur_max > 1 &&
2023 !re_string_first_byte (input, re_string_cur_idx (input)))
2024 {
2025 token->type = CHARACTER;
2026 return 1;
2027 }
2028#endif /* RE_ENABLE_I18N */
2029
2030 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2031 && re_string_cur_idx (input) + 1 < re_string_length (input))
2032 {
2033 /* In this case, '\' escape a character. */
2034 unsigned char c2;
2035 re_string_skip_bytes (input, 1);
2036 c2 = re_string_peek_byte (input, 0);
2037 token->opr.c = c2;
2038 token->type = CHARACTER;
2039 return 1;
2040 }
2041 if (c == '[') /* '[' is a special char in a bracket exps. */
2042 {
2043 unsigned char c2;
2044 int token_len;
2045 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2046 c2 = re_string_peek_byte (input, 1);
2047 else
2048 c2 = 0;
2049 token->opr.c = c2;
2050 token_len = 2;
2051 switch (c2)
2052 {
2053 case '.':
2054 token->type = OP_OPEN_COLL_ELEM;
2055 break;
2056 case '=':
2057 token->type = OP_OPEN_EQUIV_CLASS;
2058 break;
2059 case ':':
2060 if (syntax & RE_CHAR_CLASSES)
2061 {
2062 token->type = OP_OPEN_CHAR_CLASS;
2063 break;
2064 }
2065 /* else fall through. */
2066 default:
2067 token->type = CHARACTER;
2068 token->opr.c = c;
2069 token_len = 1;
2070 break;
2071 }
2072 return token_len;
2073 }
2074 switch (c)
2075 {
2076 case '-':
2077 token->type = OP_CHARSET_RANGE;
2078 break;
2079 case ']':
2080 token->type = OP_CLOSE_BRACKET;
2081 break;
2082 case '^':
2083 token->type = OP_NON_MATCH_LIST;
2084 break;
2085 default:
2086 token->type = CHARACTER;
2087 }
2088 return 1;
2089}
2090\f
2091/* Functions for parser. */
2092
2093/* Entry point of the parser.
2094 Parse the regular expression REGEXP and return the structure tree.
2095 If an error is occured, ERR is set by error code, and return NULL.
2096 This function build the following tree, from regular expression <reg_exp>:
2097 CAT
2098 / \
2099 / \
2100 <reg_exp> EOR
2101
2102 CAT means concatenation.
2103 EOR means end of regular expression. */
2104
2105static bin_tree_t *
2106parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2107 reg_errcode_t *err)
2108{
2109 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2110 bin_tree_t *tree, *eor, *root;
2111 re_token_t current_token;
2112 dfa->syntax = syntax;
2113 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2114 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2115 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2116 return NULL;
2117 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2118 if (tree != NULL)
2119 root = create_tree (dfa, tree, eor, CONCAT);
2120 else
2121 root = eor;
2122 if (BE (eor == NULL || root == NULL, 0))
2123 {
2124 *err = REG_ESPACE;
2125 return NULL;
2126 }
2127 return root;
2128}
2129
2130/* This function build the following tree, from regular expression
2131 <branch1>|<branch2>:
2132 ALT
2133 / \
2134 / \
2135 <branch1> <branch2>
2136
2137 ALT means alternative, which represents the operator '|'. */
2138
2139static bin_tree_t *
2140parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2141 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2142{
2143 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2144 bin_tree_t *tree, *branch = NULL;
2145 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2146 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2147 return NULL;
2148
2149 while (token->type == OP_ALT)
2150 {
2151 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2152 if (token->type != OP_ALT && token->type != END_OF_RE
2153 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2154 {
2155 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2156 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2157 return NULL;
2158 }
2159 else
2160 branch = NULL;
2161 tree = create_tree (dfa, tree, branch, OP_ALT);
2162 if (BE (tree == NULL, 0))
2163 {
2164 *err = REG_ESPACE;
2165 return NULL;
2166 }
2167 }
2168 return tree;
2169}
2170
2171/* This function build the following tree, from regular expression
2172 <exp1><exp2>:
2173 CAT
2174 / \
2175 / \
2176 <exp1> <exp2>
2177
2178 CAT means concatenation. */
2179
2180static bin_tree_t *
2181parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2182 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2183{
2184 bin_tree_t *tree, *expr;
2185 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2186 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2187 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2188 return NULL;
2189
2190 while (token->type != OP_ALT && token->type != END_OF_RE
2191 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2192 {
2193 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2194 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2195 {
2196 return NULL;
2197 }
2198 if (tree != NULL && expr != NULL)
2199 {
2200 tree = create_tree (dfa, tree, expr, CONCAT);
2201 if (tree == NULL)
2202 {
2203 *err = REG_ESPACE;
2204 return NULL;
2205 }
2206 }
2207 else if (tree == NULL)
2208 tree = expr;
2209 /* Otherwise expr == NULL, we don't need to create new tree. */
2210 }
2211 return tree;
2212}
2213
2214/* This function build the following tree, from regular expression a*:
2215 *
2216 |
2217 a
2218*/
2219
2220static bin_tree_t *
2221parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2222 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2223{
2224 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2225 bin_tree_t *tree;
2226 switch (token->type)
2227 {
2228 case CHARACTER:
2229 tree = create_token_tree (dfa, NULL, NULL, token);
2230 if (BE (tree == NULL, 0))
2231 {
2232 *err = REG_ESPACE;
2233 return NULL;
2234 }
2235#ifdef RE_ENABLE_I18N
2236 if (dfa->mb_cur_max > 1)
2237 {
2238 while (!re_string_eoi (regexp)
2239 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2240 {
2241 bin_tree_t *mbc_remain;
2242 fetch_token (token, regexp, syntax);
2243 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2244 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2245 if (BE (mbc_remain == NULL || tree == NULL, 0))
2246 {
2247 *err = REG_ESPACE;
2248 return NULL;
2249 }
2250 }
2251 }
2252#endif
2253 break;
2254 case OP_OPEN_SUBEXP:
2255 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2256 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2257 return NULL;
2258 break;
2259 case OP_OPEN_BRACKET:
2260 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2261 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2262 return NULL;
2263 break;
2264 case OP_BACK_REF:
2265 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2266 {
2267 *err = REG_ESUBREG;
2268 return NULL;
2269 }
2270 dfa->used_bkref_map |= 1 << token->opr.idx;
2271 tree = create_token_tree (dfa, NULL, NULL, token);
2272 if (BE (tree == NULL, 0))
2273 {
2274 *err = REG_ESPACE;
2275 return NULL;
2276 }
2277 ++dfa->nbackref;
2278 dfa->has_mb_node = 1;
2279 break;
2280 case OP_OPEN_DUP_NUM:
2281 if (syntax & RE_CONTEXT_INVALID_DUP)
2282 {
2283 *err = REG_BADRPT;
2284 return NULL;
2285 }
2286 /* FALLTHROUGH */
2287 case OP_DUP_ASTERISK:
2288 case OP_DUP_PLUS:
2289 case OP_DUP_QUESTION:
2290 if (syntax & RE_CONTEXT_INVALID_OPS)
2291 {
2292 *err = REG_BADRPT;
2293 return NULL;
2294 }
2295 else if (syntax & RE_CONTEXT_INDEP_OPS)
2296 {
2297 fetch_token (token, regexp, syntax);
2298 return parse_expression (regexp, preg, token, syntax, nest, err);
2299 }
2300 /* else fall through */
2301 case OP_CLOSE_SUBEXP:
2302 if ((token->type == OP_CLOSE_SUBEXP) &&
2303 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2304 {
2305 *err = REG_ERPAREN;
2306 return NULL;
2307 }
2308 /* else fall through */
2309 case OP_CLOSE_DUP_NUM:
2310 /* We treat it as a normal character. */
2311
2312 /* Then we can these characters as normal characters. */
2313 token->type = CHARACTER;
2314 /* mb_partial and word_char bits should be initialized already
2315 by peek_token. */
2316 tree = create_token_tree (dfa, NULL, NULL, token);
2317 if (BE (tree == NULL, 0))
2318 {
2319 *err = REG_ESPACE;
2320 return NULL;
2321 }
2322 break;
2323 case ANCHOR:
2324 if ((token->opr.ctx_type
2325 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2326 && dfa->word_ops_used == 0)
2327 init_word_char (dfa);
2328 if (token->opr.ctx_type == WORD_DELIM
2329 || token->opr.ctx_type == NOT_WORD_DELIM)
2330 {
2331 bin_tree_t *tree_first, *tree_last;
2332 if (token->opr.ctx_type == WORD_DELIM)
2333 {
2334 token->opr.ctx_type = WORD_FIRST;
2335 tree_first = create_token_tree (dfa, NULL, NULL, token);
2336 token->opr.ctx_type = WORD_LAST;
2337 }
2338 else
2339 {
2340 token->opr.ctx_type = INSIDE_WORD;
2341 tree_first = create_token_tree (dfa, NULL, NULL, token);
2342 token->opr.ctx_type = INSIDE_NOTWORD;
2343 }
2344 tree_last = create_token_tree (dfa, NULL, NULL, token);
2345 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2346 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2347 {
2348 *err = REG_ESPACE;
2349 return NULL;
2350 }
2351 }
2352 else
2353 {
2354 tree = create_token_tree (dfa, NULL, NULL, token);
2355 if (BE (tree == NULL, 0))
2356 {
2357 *err = REG_ESPACE;
2358 return NULL;
2359 }
2360 }
2361 /* We must return here, since ANCHORs can't be followed
2362 by repetition operators.
2363 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2364 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2365 fetch_token (token, regexp, syntax);
2366 return tree;
2367 case OP_PERIOD:
2368 tree = create_token_tree (dfa, NULL, NULL, token);
2369 if (BE (tree == NULL, 0))
2370 {
2371 *err = REG_ESPACE;
2372 return NULL;
2373 }
2374 if (dfa->mb_cur_max > 1)
2375 dfa->has_mb_node = 1;
2376 break;
2377 case OP_WORD:
2378 case OP_NOTWORD:
2379 tree = build_charclass_op (dfa, regexp->trans,
2380 (const unsigned char *) "alnum",
2381 (const unsigned char *) "_",
2382 token->type == OP_NOTWORD, err);
2383 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2384 return NULL;
2385 break;
2386 case OP_SPACE:
2387 case OP_NOTSPACE:
2388 tree = build_charclass_op (dfa, regexp->trans,
2389 (const unsigned char *) "space",
2390 (const unsigned char *) "",
2391 token->type == OP_NOTSPACE, err);
2392 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2393 return NULL;
2394 break;
2395 case OP_ALT:
2396 case END_OF_RE:
2397 return NULL;
2398 case BACK_SLASH:
2399 *err = REG_EESCAPE;
2400 return NULL;
2401 default:
2402 /* Must not happen? */
2403#ifdef DEBUG
2404 assert (0);
2405#endif
2406 return NULL;
2407 }
2408 fetch_token (token, regexp, syntax);
2409
2410 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2411 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2412 {
2413 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2414 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2415 return NULL;
2416 /* In BRE consecutive duplications are not allowed. */
2417 if ((syntax & RE_CONTEXT_INVALID_DUP)
2418 && (token->type == OP_DUP_ASTERISK
2419 || token->type == OP_OPEN_DUP_NUM))
2420 {
2421 *err = REG_BADRPT;
2422 return NULL;
2423 }
2424 }
2425
2426 return tree;
2427}
2428
2429/* This function build the following tree, from regular expression
2430 (<reg_exp>):
2431 SUBEXP
2432 |
2433 <reg_exp>
2434*/
2435
2436static bin_tree_t *
2437parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2438 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2439{
2440 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2441 bin_tree_t *tree;
2442 size_t cur_nsub;
2443 cur_nsub = preg->re_nsub++;
2444
2445 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2446
2447 /* The subexpression may be a null string. */
2448 if (token->type == OP_CLOSE_SUBEXP)
2449 tree = NULL;
2450 else
2451 {
2452 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2453 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2454 *err = REG_EPAREN;
2455 if (BE (*err != REG_NOERROR, 0))
2456 return NULL;
2457 }
2458
2459 if (cur_nsub <= '9' - '1')
2460 dfa->completed_bkref_map |= 1 << cur_nsub;
2461
2462 tree = create_tree (dfa, tree, NULL, SUBEXP);
2463 if (BE (tree == NULL, 0))
2464 {
2465 *err = REG_ESPACE;
2466 return NULL;
2467 }
2468 tree->token.opr.idx = cur_nsub;
2469 return tree;
2470}
2471
2472/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2473
2474static bin_tree_t *
2475parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2476 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2477{
2478 bin_tree_t *tree = NULL, *old_tree = NULL;
2479 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2480 re_token_t start_token = *token;
2481
2482 if (token->type == OP_OPEN_DUP_NUM)
2483 {
2484 end = 0;
2485 start = fetch_number (regexp, token, syntax);
2486 if (start == REG_MISSING)
2487 {
2488 if (token->type == CHARACTER && token->opr.c == ',')
2489 start = 0; /* We treat "{,m}" as "{0,m}". */
2490 else
2491 {
2492 *err = REG_BADBR; /* <re>{} is invalid. */
2493 return NULL;
2494 }
2495 }
2496 if (BE (start != REG_ERROR, 1))
2497 {
2498 /* We treat "{n}" as "{n,n}". */
2499 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2500 : ((token->type == CHARACTER && token->opr.c == ',')
2501 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2502 }
2503 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2504 {
2505 /* Invalid sequence. */
2506 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2507 {
2508 if (token->type == END_OF_RE)
2509 *err = REG_EBRACE;
2510 else
2511 *err = REG_BADBR;
2512
2513 return NULL;
2514 }
2515
2516 /* If the syntax bit is set, rollback. */
2517 re_string_set_index (regexp, start_idx);
2518 *token = start_token;
2519 token->type = CHARACTER;
2520 /* mb_partial and word_char bits should be already initialized by
2521 peek_token. */
2522 return elem;
2523 }
2524
2525 if (BE ((end != REG_MISSING && start > end)
2526 || token->type != OP_CLOSE_DUP_NUM, 0))
2527 {
2528 /* First number greater than second. */
2529 *err = REG_BADBR;
2530 return NULL;
2531 }
2532 }
2533 else
2534 {
2535 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2536 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2537 }
2538
2539 fetch_token (token, regexp, syntax);
2540
2541 if (BE (elem == NULL, 0))
2542 return NULL;
2543 if (BE (start == 0 && end == 0, 0))
2544 {
2545 postorder (elem, free_tree, NULL);
2546 return NULL;
2547 }
2548
2549 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2550 if (BE (start > 0, 0))
2551 {
2552 tree = elem;
2553 for (i = 2; i <= start; ++i)
2554 {
2555 elem = duplicate_tree (elem, dfa);
2556 tree = create_tree (dfa, tree, elem, CONCAT);
2557 if (BE (elem == NULL || tree == NULL, 0))
2558 goto parse_dup_op_espace;
2559 }
2560
2561 if (start == end)
2562 return tree;
2563
2564 /* Duplicate ELEM before it is marked optional. */
2565 elem = duplicate_tree (elem, dfa);
2566 old_tree = tree;
2567 }
2568 else
2569 old_tree = NULL;
2570
2571 if (elem->token.type == SUBEXP)
2572 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2573
2574 tree = create_tree (dfa, elem, NULL,
2575 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2576 if (BE (tree == NULL, 0))
2577 goto parse_dup_op_espace;
2578
2579/* From gnulib's "intprops.h":
2580 True if the arithmetic type T is signed. */
2581#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2582
2583 /* This loop is actually executed only when end != REG_MISSING,
2584 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2585 already created the start+1-th copy. */
2586 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2587 for (i = start + 2; i <= end; ++i)
2588 {
2589 elem = duplicate_tree (elem, dfa);
2590 tree = create_tree (dfa, tree, elem, CONCAT);
2591 if (BE (elem == NULL || tree == NULL, 0))
2592 goto parse_dup_op_espace;
2593
2594 tree = create_tree (dfa, tree, NULL, OP_ALT);
2595 if (BE (tree == NULL, 0))
2596 goto parse_dup_op_espace;
2597 }
2598
2599 if (old_tree)
2600 tree = create_tree (dfa, old_tree, tree, CONCAT);
2601
2602 return tree;
2603
2604 parse_dup_op_espace:
2605 *err = REG_ESPACE;
2606 return NULL;
2607}
2608
2609/* Size of the names for collating symbol/equivalence_class/character_class.
2610 I'm not sure, but maybe enough. */
2611#define BRACKET_NAME_BUF_SIZE 32
2612
2613#ifndef _LIBC
2614 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2615 Build the range expression which starts from START_ELEM, and ends
2616 at END_ELEM. The result are written to MBCSET and SBCSET.
2617 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2618 mbcset->range_ends, is a pointer argument sinse we may
2619 update it. */
2620
2621static reg_errcode_t
2622internal_function
2623# ifdef RE_ENABLE_I18N
2624build_range_exp (const reg_syntax_t syntax,
2625 bitset_t sbcset,
2626 re_charset_t *mbcset,
2627 Idx *range_alloc,
2628 const bracket_elem_t *start_elem,
2629 const bracket_elem_t *end_elem)
2630# else /* not RE_ENABLE_I18N */
2631build_range_exp (const reg_syntax_t syntax,
2632 bitset_t sbcset,
2633 const bracket_elem_t *start_elem,
2634 const bracket_elem_t *end_elem)
2635# endif /* not RE_ENABLE_I18N */
2636{
2637 unsigned int start_ch, end_ch;
2638 /* Equivalence Classes and Character Classes can't be a range start/end. */
2639 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2640 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2641 0))
2642 return REG_ERANGE;
2643
2644 /* We can handle no multi character collating elements without libc
2645 support. */
2646 if (BE ((start_elem->type == COLL_SYM
2647 && strlen ((char *) start_elem->opr.name) > 1)
2648 || (end_elem->type == COLL_SYM
2649 && strlen ((char *) end_elem->opr.name) > 1), 0))
2650 return REG_ECOLLATE;
2651
2652# ifdef RE_ENABLE_I18N
2653 {
2654 wchar_t wc;
2655 wint_t start_wc;
2656 wint_t end_wc;
2657 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2658
2659 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2660 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2661 : 0));
2662 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2663 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2664 : 0));
2665 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2666 ? __btowc (start_ch) : start_elem->opr.wch);
2667 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2668 ? __btowc (end_ch) : end_elem->opr.wch);
2669 if (start_wc == WEOF || end_wc == WEOF)
2670 return REG_ECOLLATE;
2671 cmp_buf[0] = start_wc;
2672 cmp_buf[4] = end_wc;
2673
2674 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2675 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2676 return REG_ERANGE;
2677
2678 /* Got valid collation sequence values, add them as a new entry.
2679 However, for !_LIBC we have no collation elements: if the
2680 character set is single byte, the single byte character set
2681 that we build below suffices. parse_bracket_exp passes
2682 no MBCSET if dfa->mb_cur_max == 1. */
2683 if (mbcset)
2684 {
2685 /* Check the space of the arrays. */
2686 if (BE (*range_alloc == mbcset->nranges, 0))
2687 {
2688 /* There is not enough space, need realloc. */
2689 wchar_t *new_array_start, *new_array_end;
2690 Idx new_nranges;
2691
2692 /* +1 in case of mbcset->nranges is 0. */
2693 new_nranges = 2 * mbcset->nranges + 1;
2694 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2695 are NULL if *range_alloc == 0. */
2696 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2697 new_nranges);
2698 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2699 new_nranges);
2700
2701 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2702 return REG_ESPACE;
2703
2704 mbcset->range_starts = new_array_start;
2705 mbcset->range_ends = new_array_end;
2706 *range_alloc = new_nranges;
2707 }
2708
2709 mbcset->range_starts[mbcset->nranges] = start_wc;
2710 mbcset->range_ends[mbcset->nranges++] = end_wc;
2711 }
2712
2713 /* Build the table for single byte characters. */
2714 for (wc = 0; wc < SBC_MAX; ++wc)
2715 {
2716 cmp_buf[2] = wc;
2717 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2718 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2719 bitset_set (sbcset, wc);
2720 }
2721 }
2722# else /* not RE_ENABLE_I18N */
2723 {
2724 unsigned int ch;
2725 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2726 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2727 : 0));
2728 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2729 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2730 : 0));
2731 if (start_ch > end_ch)
2732 return REG_ERANGE;
2733 /* Build the table for single byte characters. */
2734 for (ch = 0; ch < SBC_MAX; ++ch)
2735 if (start_ch <= ch && ch <= end_ch)
2736 bitset_set (sbcset, ch);
2737 }
2738# endif /* not RE_ENABLE_I18N */
2739 return REG_NOERROR;
2740}
2741#endif /* not _LIBC */
2742
2743#ifndef _LIBC
2744/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2745 Build the collating element which is represented by NAME.
2746 The result are written to MBCSET and SBCSET.
2747 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2748 pointer argument since we may update it. */
2749
2750static reg_errcode_t
2751internal_function
2752build_collating_symbol (bitset_t sbcset,
2753# ifdef RE_ENABLE_I18N
2754 re_charset_t *mbcset, Idx *coll_sym_alloc,
2755# endif
2756 const unsigned char *name)
2757{
2758 size_t name_len = strlen ((const char *) name);
2759 if (BE (name_len != 1, 0))
2760 return REG_ECOLLATE;
2761 else
2762 {
2763 bitset_set (sbcset, name[0]);
2764 return REG_NOERROR;
2765 }
2766}
2767#endif /* not _LIBC */
2768
2769/* This function parse bracket expression like "[abc]", "[a-c]",
2770 "[[.a-a.]]" etc. */
2771
2772static bin_tree_t *
2773parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2774 reg_syntax_t syntax, reg_errcode_t *err)
2775{
2776#ifdef _LIBC
2777 const unsigned char *collseqmb;
2778 const char *collseqwc;
2779 uint32_t nrules;
2780 int32_t table_size;
2781 const int32_t *symb_table;
2782 const unsigned char *extra;
2783
2784 /* Local function for parse_bracket_exp used in _LIBC environement.
2785 Seek the collating symbol entry correspondings to NAME.
2786 Return the index of the symbol in the SYMB_TABLE. */
2787
2788 auto inline int32_t
2789 __attribute ((always_inline))
2790 seek_collating_symbol_entry (name, name_len)
2791 const unsigned char *name;
2792 size_t name_len;
2793 {
2794 int32_t hash = elem_hash ((const char *) name, name_len);
2795 int32_t elem = hash % table_size;
2796 if (symb_table[2 * elem] != 0)
2797 {
2798 int32_t second = hash % (table_size - 2) + 1;
2799
2800 do
2801 {
2802 /* First compare the hashing value. */
2803 if (symb_table[2 * elem] == hash
2804 /* Compare the length of the name. */
2805 && name_len == extra[symb_table[2 * elem + 1]]
2806 /* Compare the name. */
2807 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2808 name_len) == 0)
2809 {
2810 /* Yep, this is the entry. */
2811 break;
2812 }
2813
2814 /* Next entry. */
2815 elem += second;
2816 }
2817 while (symb_table[2 * elem] != 0);
2818 }
2819 return elem;
2820 }
2821
2822 /* Local function for parse_bracket_exp used in _LIBC environment.
2823 Look up the collation sequence value of BR_ELEM.
2824 Return the value if succeeded, UINT_MAX otherwise. */
2825
2826 auto inline unsigned int
2827 __attribute ((always_inline))
2828 lookup_collation_sequence_value (br_elem)
2829 bracket_elem_t *br_elem;
2830 {
2831 if (br_elem->type == SB_CHAR)
2832 {
2833 /*
2834 if (MB_CUR_MAX == 1)
2835 */
2836 if (nrules == 0)
2837 return collseqmb[br_elem->opr.ch];
2838 else
2839 {
2840 wint_t wc = __btowc (br_elem->opr.ch);
2841 return __collseq_table_lookup (collseqwc, wc);
2842 }
2843 }
2844 else if (br_elem->type == MB_CHAR)
2845 {
2846 if (nrules != 0)
2847 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2848 }
2849 else if (br_elem->type == COLL_SYM)
2850 {
2851 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2852 if (nrules != 0)
2853 {
2854 int32_t elem, idx;
2855 elem = seek_collating_symbol_entry (br_elem->opr.name,
2856 sym_name_len);
2857 if (symb_table[2 * elem] != 0)
2858 {
2859 /* We found the entry. */
2860 idx = symb_table[2 * elem + 1];
2861 /* Skip the name of collating element name. */
2862 idx += 1 + extra[idx];
2863 /* Skip the byte sequence of the collating element. */
2864 idx += 1 + extra[idx];
2865 /* Adjust for the alignment. */
2866 idx = (idx + 3) & ~3;
2867 /* Skip the multibyte collation sequence value. */
2868 idx += sizeof (unsigned int);
2869 /* Skip the wide char sequence of the collating element. */
2870 idx += sizeof (unsigned int) *
2871 (1 + *(unsigned int *) (extra + idx));
2872 /* Return the collation sequence value. */
2873 return *(unsigned int *) (extra + idx);
2874 }
2875 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2876 {
2877 /* No valid character. Match it as a single byte
2878 character. */
2879 return collseqmb[br_elem->opr.name[0]];
2880 }
2881 }
2882 else if (sym_name_len == 1)
2883 return collseqmb[br_elem->opr.name[0]];
2884 }
2885 return UINT_MAX;
2886 }
2887
2888 /* Local function for parse_bracket_exp used in _LIBC environement.
2889 Build the range expression which starts from START_ELEM, and ends
2890 at END_ELEM. The result are written to MBCSET and SBCSET.
2891 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2892 mbcset->range_ends, is a pointer argument sinse we may
2893 update it. */
2894
2895 auto inline reg_errcode_t
2896 __attribute ((always_inline))
2897 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2898 re_charset_t *mbcset;
2899 Idx *range_alloc;
2900 bitset_t sbcset;
2901 bracket_elem_t *start_elem, *end_elem;
2902 {
2903 unsigned int ch;
2904 uint32_t start_collseq;
2905 uint32_t end_collseq;
2906
2907 /* Equivalence Classes and Character Classes can't be a range
2908 start/end. */
2909 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2910 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2911 0))
2912 return REG_ERANGE;
2913
2914 start_collseq = lookup_collation_sequence_value (start_elem);
2915 end_collseq = lookup_collation_sequence_value (end_elem);
2916 /* Check start/end collation sequence values. */
2917 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2918 return REG_ECOLLATE;
2919 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2920 return REG_ERANGE;
2921
2922 /* Got valid collation sequence values, add them as a new entry.
2923 However, if we have no collation elements, and the character set
2924 is single byte, the single byte character set that we
2925 build below suffices. */
2926 if (nrules > 0 || dfa->mb_cur_max > 1)
2927 {
2928 /* Check the space of the arrays. */
2929 if (BE (*range_alloc == mbcset->nranges, 0))
2930 {
2931 /* There is not enough space, need realloc. */
2932 uint32_t *new_array_start;
2933 uint32_t *new_array_end;
2934 Idx new_nranges;
2935
2936 /* +1 in case of mbcset->nranges is 0. */
2937 new_nranges = 2 * mbcset->nranges + 1;
2938 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2939 new_nranges);
2940 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2941 new_nranges);
2942
2943 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2944 return REG_ESPACE;
2945
2946 mbcset->range_starts = new_array_start;
2947 mbcset->range_ends = new_array_end;
2948 *range_alloc = new_nranges;
2949 }
2950
2951 mbcset->range_starts[mbcset->nranges] = start_collseq;
2952 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2953 }
2954
2955 /* Build the table for single byte characters. */
2956 for (ch = 0; ch < SBC_MAX; ch++)
2957 {
2958 uint32_t ch_collseq;
2959 /*
2960 if (MB_CUR_MAX == 1)
2961 */
2962 if (nrules == 0)
2963 ch_collseq = collseqmb[ch];
2964 else
2965 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2966 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2967 bitset_set (sbcset, ch);
2968 }
2969 return REG_NOERROR;
2970 }
2971
2972 /* Local function for parse_bracket_exp used in _LIBC environement.
2973 Build the collating element which is represented by NAME.
2974 The result are written to MBCSET and SBCSET.
2975 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2976 pointer argument sinse we may update it. */
2977
2978 auto inline reg_errcode_t
2979 __attribute ((always_inline))
2980 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2981 re_charset_t *mbcset;
2982 Idx *coll_sym_alloc;
2983 bitset_t sbcset;
2984 const unsigned char *name;
2985 {
2986 int32_t elem, idx;
2987 size_t name_len = strlen ((const char *) name);
2988 if (nrules != 0)
2989 {
2990 elem = seek_collating_symbol_entry (name, name_len);
2991 if (symb_table[2 * elem] != 0)
2992 {
2993 /* We found the entry. */
2994 idx = symb_table[2 * elem + 1];
2995 /* Skip the name of collating element name. */
2996 idx += 1 + extra[idx];
2997 }
2998 else if (symb_table[2 * elem] == 0 && name_len == 1)
2999 {
3000 /* No valid character, treat it as a normal
3001 character. */
3002 bitset_set (sbcset, name[0]);
3003 return REG_NOERROR;
3004 }
3005 else
3006 return REG_ECOLLATE;
3007
3008 /* Got valid collation sequence, add it as a new entry. */
3009 /* Check the space of the arrays. */
3010 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3011 {
3012 /* Not enough, realloc it. */
3013 /* +1 in case of mbcset->ncoll_syms is 0. */
3014 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3015 /* Use realloc since mbcset->coll_syms is NULL
3016 if *alloc == 0. */
3017 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3018 new_coll_sym_alloc);
3019 if (BE (new_coll_syms == NULL, 0))
3020 return REG_ESPACE;
3021 mbcset->coll_syms = new_coll_syms;
3022 *coll_sym_alloc = new_coll_sym_alloc;
3023 }
3024 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3025 return REG_NOERROR;
3026 }
3027 else
3028 {
3029 if (BE (name_len != 1, 0))
3030 return REG_ECOLLATE;
3031 else
3032 {
3033 bitset_set (sbcset, name[0]);
3034 return REG_NOERROR;
3035 }
3036 }
3037 }
3038#endif
3039
3040 re_token_t br_token;
3041 re_bitset_ptr_t sbcset;
3042#ifdef RE_ENABLE_I18N
3043 re_charset_t *mbcset;
3044 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3045 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3046#endif /* not RE_ENABLE_I18N */
3047 bool non_match = false;
3048 bin_tree_t *work_tree;
3049 int token_len;
3050 bool first_round = true;
3051#ifdef _LIBC
3052 collseqmb = (const unsigned char *)
3053 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3054 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3055 if (nrules)
3056 {
3057 /*
3058 if (MB_CUR_MAX > 1)
3059 */
3060 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3061 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3062 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3063 _NL_COLLATE_SYMB_TABLEMB);
3064 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3065 _NL_COLLATE_SYMB_EXTRAMB);
3066 }
3067#endif
3068 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3069#ifdef RE_ENABLE_I18N
3070 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3071#endif /* RE_ENABLE_I18N */
3072#ifdef RE_ENABLE_I18N
3073 if (BE (sbcset == NULL || mbcset == NULL, 0))
3074#else
3075 if (BE (sbcset == NULL, 0))
3076#endif /* RE_ENABLE_I18N */
3077 {
3078 *err = REG_ESPACE;
3079 return NULL;
3080 }
3081
3082 token_len = peek_token_bracket (token, regexp, syntax);
3083 if (BE (token->type == END_OF_RE, 0))
3084 {
3085 *err = REG_BADPAT;
3086 goto parse_bracket_exp_free_return;
3087 }
3088 if (token->type == OP_NON_MATCH_LIST)
3089 {
3090#ifdef RE_ENABLE_I18N
3091 mbcset->non_match = 1;
3092#endif /* not RE_ENABLE_I18N */
3093 non_match = true;
3094 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3095 bitset_set (sbcset, '\n');
3096 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3097 token_len = peek_token_bracket (token, regexp, syntax);
3098 if (BE (token->type == END_OF_RE, 0))
3099 {
3100 *err = REG_BADPAT;
3101 goto parse_bracket_exp_free_return;
3102 }
3103 }
3104
3105 /* We treat the first ']' as a normal character. */
3106 if (token->type == OP_CLOSE_BRACKET)
3107 token->type = CHARACTER;
3108
3109 while (1)
3110 {
3111 bracket_elem_t start_elem, end_elem;
3112 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3113 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3114 reg_errcode_t ret;
3115 int token_len2 = 0;
3116 bool is_range_exp = false;
3117 re_token_t token2;
3118
3119 start_elem.opr.name = start_name_buf;
3120 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3121 syntax, first_round);
3122 if (BE (ret != REG_NOERROR, 0))
3123 {
3124 *err = ret;
3125 goto parse_bracket_exp_free_return;
3126 }
3127 first_round = false;
3128
3129 /* Get information about the next token. We need it in any case. */
3130 token_len = peek_token_bracket (token, regexp, syntax);
3131
3132 /* Do not check for ranges if we know they are not allowed. */
3133 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3134 {
3135 if (BE (token->type == END_OF_RE, 0))
3136 {
3137 *err = REG_EBRACK;
3138 goto parse_bracket_exp_free_return;
3139 }
3140 if (token->type == OP_CHARSET_RANGE)
3141 {
3142 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3143 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3144 if (BE (token2.type == END_OF_RE, 0))
3145 {
3146 *err = REG_EBRACK;
3147 goto parse_bracket_exp_free_return;
3148 }
3149 if (token2.type == OP_CLOSE_BRACKET)
3150 {
3151 /* We treat the last '-' as a normal character. */
3152 re_string_skip_bytes (regexp, -token_len);
3153 token->type = CHARACTER;
3154 }
3155 else
3156 is_range_exp = true;
3157 }
3158 }
3159
3160 if (is_range_exp == true)
3161 {
3162 end_elem.opr.name = end_name_buf;
3163 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3164 dfa, syntax, true);
3165 if (BE (ret != REG_NOERROR, 0))
3166 {
3167 *err = ret;
3168 goto parse_bracket_exp_free_return;
3169 }
3170
3171 token_len = peek_token_bracket (token, regexp, syntax);
3172
3173#ifdef _LIBC
3174 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3175 &start_elem, &end_elem);
3176#else
3177# ifdef RE_ENABLE_I18N
3178 *err = build_range_exp (syntax, sbcset,
3179 dfa->mb_cur_max > 1 ? mbcset : NULL,
3180 &range_alloc, &start_elem, &end_elem);
3181# else
3182 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3183# endif
3184#endif /* RE_ENABLE_I18N */
3185 if (BE (*err != REG_NOERROR, 0))
3186 goto parse_bracket_exp_free_return;
3187 }
3188 else
3189 {
3190 switch (start_elem.type)
3191 {
3192 case SB_CHAR:
3193 bitset_set (sbcset, start_elem.opr.ch);
3194 break;
3195#ifdef RE_ENABLE_I18N
3196 case MB_CHAR:
3197 /* Check whether the array has enough space. */
3198 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3199 {
3200 wchar_t *new_mbchars;
3201 /* Not enough, realloc it. */
3202 /* +1 in case of mbcset->nmbchars is 0. */
3203 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3204 /* Use realloc since array is NULL if *alloc == 0. */
3205 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3206 mbchar_alloc);
3207 if (BE (new_mbchars == NULL, 0))
3208 goto parse_bracket_exp_espace;
3209 mbcset->mbchars = new_mbchars;
3210 }
3211 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3212 break;
3213#endif /* RE_ENABLE_I18N */
3214 case EQUIV_CLASS:
3215 *err = build_equiv_class (sbcset,
3216#ifdef RE_ENABLE_I18N
3217 mbcset, &equiv_class_alloc,
3218#endif /* RE_ENABLE_I18N */
3219 start_elem.opr.name);
3220 if (BE (*err != REG_NOERROR, 0))
3221 goto parse_bracket_exp_free_return;
3222 break;
3223 case COLL_SYM:
3224 *err = build_collating_symbol (sbcset,
3225#ifdef RE_ENABLE_I18N
3226 mbcset, &coll_sym_alloc,
3227#endif /* RE_ENABLE_I18N */
3228 start_elem.opr.name);
3229 if (BE (*err != REG_NOERROR, 0))
3230 goto parse_bracket_exp_free_return;
3231 break;
3232 case CHAR_CLASS:
3233 *err = build_charclass (regexp->trans, sbcset,
3234#ifdef RE_ENABLE_I18N
3235 mbcset, &char_class_alloc,
3236#endif /* RE_ENABLE_I18N */
3237 start_elem.opr.name, syntax);
3238 if (BE (*err != REG_NOERROR, 0))
3239 goto parse_bracket_exp_free_return;
3240 break;
3241 default:
3242 assert (0);
3243 break;
3244 }
3245 }
3246 if (BE (token->type == END_OF_RE, 0))
3247 {
3248 *err = REG_EBRACK;
3249 goto parse_bracket_exp_free_return;
3250 }
3251 if (token->type == OP_CLOSE_BRACKET)
3252 break;
3253 }
3254
3255 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3256
3257 /* If it is non-matching list. */
3258 if (non_match)
3259 bitset_not (sbcset);
3260
3261#ifdef RE_ENABLE_I18N
3262 /* Ensure only single byte characters are set. */
3263 if (dfa->mb_cur_max > 1)
3264 bitset_mask (sbcset, dfa->sb_char);
3265
3266 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3267 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3268 || mbcset->non_match)))
3269 {
3270 bin_tree_t *mbc_tree;
3271 int sbc_idx;
3272 /* Build a tree for complex bracket. */
3273 dfa->has_mb_node = 1;
3274 br_token.type = COMPLEX_BRACKET;
3275 br_token.opr.mbcset = mbcset;
3276 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3277 if (BE (mbc_tree == NULL, 0))
3278 goto parse_bracket_exp_espace;
3279 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3280 if (sbcset[sbc_idx])
3281 break;
3282 /* If there are no bits set in sbcset, there is no point
3283 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3284 if (sbc_idx < BITSET_WORDS)
3285 {
3286 /* Build a tree for simple bracket. */
3287 br_token.type = SIMPLE_BRACKET;
3288 br_token.opr.sbcset = sbcset;
3289 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3290 if (BE (work_tree == NULL, 0))
3291 goto parse_bracket_exp_espace;
3292
3293 /* Then join them by ALT node. */
3294 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3295 if (BE (work_tree == NULL, 0))
3296 goto parse_bracket_exp_espace;
3297 }
3298 else
3299 {
3300 re_free (sbcset);
3301 work_tree = mbc_tree;
3302 }
3303 }
3304 else
3305#endif /* not RE_ENABLE_I18N */
3306 {
3307#ifdef RE_ENABLE_I18N
3308 free_charset (mbcset);
3309#endif
3310 /* Build a tree for simple bracket. */
3311 br_token.type = SIMPLE_BRACKET;
3312 br_token.opr.sbcset = sbcset;
3313 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3314 if (BE (work_tree == NULL, 0))
3315 goto parse_bracket_exp_espace;
3316 }
3317 return work_tree;
3318
3319 parse_bracket_exp_espace:
3320 *err = REG_ESPACE;
3321 parse_bracket_exp_free_return:
3322 re_free (sbcset);
3323#ifdef RE_ENABLE_I18N
3324 free_charset (mbcset);
3325#endif /* RE_ENABLE_I18N */
3326 return NULL;
3327}
3328
3329/* Parse an element in the bracket expression. */
3330
3331static reg_errcode_t
3332parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3333 re_token_t *token, int token_len, re_dfa_t *dfa,
3334 reg_syntax_t syntax, bool accept_hyphen)
3335{
3336#ifdef RE_ENABLE_I18N
3337 int cur_char_size;
3338 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3339 if (cur_char_size > 1)
3340 {
3341 elem->type = MB_CHAR;
3342 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3343 re_string_skip_bytes (regexp, cur_char_size);
3344 return REG_NOERROR;
3345 }
3346#endif /* RE_ENABLE_I18N */
3347 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3348 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3349 || token->type == OP_OPEN_EQUIV_CLASS)
3350 return parse_bracket_symbol (elem, regexp, token);
3351 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3352 {
3353 /* A '-' must only appear as anything but a range indicator before
3354 the closing bracket. Everything else is an error. */
3355 re_token_t token2;
3356 (void) peek_token_bracket (&token2, regexp, syntax);
3357 if (token2.type != OP_CLOSE_BRACKET)
3358 /* The actual error value is not standardized since this whole
3359 case is undefined. But ERANGE makes good sense. */
3360 return REG_ERANGE;
3361 }
3362 elem->type = SB_CHAR;
3363 elem->opr.ch = token->opr.c;
3364 return REG_NOERROR;
3365}
3366
3367/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3368 such as [:<character_class>:], [.<collating_element>.], and
3369 [=<equivalent_class>=]. */
3370
3371static reg_errcode_t
3372parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3373 re_token_t *token)
3374{
3375 unsigned char ch, delim = token->opr.c;
3376 int i = 0;
3377 if (re_string_eoi(regexp))
3378 return REG_EBRACK;
3379 for (;; ++i)
3380 {
3381 if (i >= BRACKET_NAME_BUF_SIZE)
3382 return REG_EBRACK;
3383 if (token->type == OP_OPEN_CHAR_CLASS)
3384 ch = re_string_fetch_byte_case (regexp);
3385 else
3386 ch = re_string_fetch_byte (regexp);
3387 if (re_string_eoi(regexp))
3388 return REG_EBRACK;
3389 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3390 break;
3391 elem->opr.name[i] = ch;
3392 }
3393 re_string_skip_bytes (regexp, 1);
3394 elem->opr.name[i] = '\0';
3395 switch (token->type)
3396 {
3397 case OP_OPEN_COLL_ELEM:
3398 elem->type = COLL_SYM;
3399 break;
3400 case OP_OPEN_EQUIV_CLASS:
3401 elem->type = EQUIV_CLASS;
3402 break;
3403 case OP_OPEN_CHAR_CLASS:
3404 elem->type = CHAR_CLASS;
3405 break;
3406 default:
3407 break;
3408 }
3409 return REG_NOERROR;
3410}
3411
3412 /* Helper function for parse_bracket_exp.
3413 Build the equivalence class which is represented by NAME.
3414 The result are written to MBCSET and SBCSET.
3415 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3416 is a pointer argument sinse we may update it. */
3417
3418static reg_errcode_t
3419#ifdef RE_ENABLE_I18N
3420build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3421 Idx *equiv_class_alloc, const unsigned char *name)
3422#else /* not RE_ENABLE_I18N */
3423build_equiv_class (bitset_t sbcset, const unsigned char *name)
3424#endif /* not RE_ENABLE_I18N */
3425{
3426#ifdef _LIBC
3427 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3428 if (nrules != 0)
3429 {
3430 const int32_t *table, *indirect;
3431 const unsigned char *weights, *extra, *cp;
3432 unsigned char char_buf[2];
3433 int32_t idx1, idx2;
3434 unsigned int ch;
3435 size_t len;
3436 /* This #include defines a local function! */
3437# include <locale/weight.h>
3438 /* Calculate the index for equivalence class. */
3439 cp = name;
3440 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3441 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3442 _NL_COLLATE_WEIGHTMB);
3443 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3444 _NL_COLLATE_EXTRAMB);
3445 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3446 _NL_COLLATE_INDIRECTMB);
3447 idx1 = findidx (&cp);
3448 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3449 /* This isn't a valid character. */
3450 return REG_ECOLLATE;
3451
3452 /* Build single byte matcing table for this equivalence class. */
3453 char_buf[1] = (unsigned char) '\0';
3454 len = weights[idx1 & 0xffffff];
3455 for (ch = 0; ch < SBC_MAX; ++ch)
3456 {
3457 char_buf[0] = ch;
3458 cp = char_buf;
3459 idx2 = findidx (&cp);
3460/*
3461 idx2 = table[ch];
3462*/
3463 if (idx2 == 0)
3464 /* This isn't a valid character. */
3465 continue;
3466 /* Compare only if the length matches and the collation rule
3467 index is the same. */
3468 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3469 {
3470 int cnt = 0;
3471
3472 while (cnt <= len &&
3473 weights[(idx1 & 0xffffff) + 1 + cnt]
3474 == weights[(idx2 & 0xffffff) + 1 + cnt])
3475 ++cnt;
3476
3477 if (cnt > len)
3478 bitset_set (sbcset, ch);
3479 }
3480 }
3481 /* Check whether the array has enough space. */
3482 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3483 {
3484 /* Not enough, realloc it. */
3485 /* +1 in case of mbcset->nequiv_classes is 0. */
3486 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3487 /* Use realloc since the array is NULL if *alloc == 0. */
3488 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3489 int32_t,
3490 new_equiv_class_alloc);
3491 if (BE (new_equiv_classes == NULL, 0))
3492 return REG_ESPACE;
3493 mbcset->equiv_classes = new_equiv_classes;
3494 *equiv_class_alloc = new_equiv_class_alloc;
3495 }
3496 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3497 }
3498 else
3499#endif /* _LIBC */
3500 {
3501 if (BE (strlen ((const char *) name) != 1, 0))
3502 return REG_ECOLLATE;
3503 bitset_set (sbcset, *name);
3504 }
3505 return REG_NOERROR;
3506}
3507
3508 /* Helper function for parse_bracket_exp.
3509 Build the character class which is represented by NAME.
3510 The result are written to MBCSET and SBCSET.
3511 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3512 is a pointer argument sinse we may update it. */
3513
3514static reg_errcode_t
3515#ifdef RE_ENABLE_I18N
3516build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3517 re_charset_t *mbcset, Idx *char_class_alloc,
3518 const unsigned char *class_name, reg_syntax_t syntax)
3519#else /* not RE_ENABLE_I18N */
3520build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3521 const unsigned char *class_name, reg_syntax_t syntax)
3522#endif /* not RE_ENABLE_I18N */
3523{
3524 int i;
3525 const char *name = (const char *) class_name;
3526
3527 /* In case of REG_ICASE "upper" and "lower" match the both of
3528 upper and lower cases. */
3529 if ((syntax & RE_ICASE)
3530 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3531 name = "alpha";
3532
3533#ifdef RE_ENABLE_I18N
3534 /* Check the space of the arrays. */
3535 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3536 {
3537 /* Not enough, realloc it. */
3538 /* +1 in case of mbcset->nchar_classes is 0. */
3539 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3540 /* Use realloc since array is NULL if *alloc == 0. */
3541 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3542 new_char_class_alloc);
3543 if (BE (new_char_classes == NULL, 0))
3544 return REG_ESPACE;
3545 mbcset->char_classes = new_char_classes;
3546 *char_class_alloc = new_char_class_alloc;
3547 }
3548 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3549#endif /* RE_ENABLE_I18N */
3550
3551#define BUILD_CHARCLASS_LOOP(ctype_func) \
3552 do { \
3553 if (BE (trans != NULL, 0)) \
3554 { \
3555 for (i = 0; i < SBC_MAX; ++i) \
3556 if (ctype_func (i)) \
3557 bitset_set (sbcset, trans[i]); \
3558 } \
3559 else \
3560 { \
3561 for (i = 0; i < SBC_MAX; ++i) \
3562 if (ctype_func (i)) \
3563 bitset_set (sbcset, i); \
3564 } \
3565 } while (0)
3566
3567 if (strcmp (name, "alnum") == 0)
3568 BUILD_CHARCLASS_LOOP (isalnum);
3569 else if (strcmp (name, "cntrl") == 0)
3570 BUILD_CHARCLASS_LOOP (iscntrl);
3571 else if (strcmp (name, "lower") == 0)
3572 BUILD_CHARCLASS_LOOP (islower);
3573 else if (strcmp (name, "space") == 0)
3574 BUILD_CHARCLASS_LOOP (isspace);
3575 else if (strcmp (name, "alpha") == 0)
3576 BUILD_CHARCLASS_LOOP (isalpha);
3577 else if (strcmp (name, "digit") == 0)
3578 BUILD_CHARCLASS_LOOP (isdigit);
3579 else if (strcmp (name, "print") == 0)
3580 BUILD_CHARCLASS_LOOP (isprint);
3581 else if (strcmp (name, "upper") == 0)
3582 BUILD_CHARCLASS_LOOP (isupper);
3583 else if (strcmp (name, "blank") == 0)
3584 BUILD_CHARCLASS_LOOP (isblank);
3585 else if (strcmp (name, "graph") == 0)
3586 BUILD_CHARCLASS_LOOP (isgraph);
3587 else if (strcmp (name, "punct") == 0)
3588 BUILD_CHARCLASS_LOOP (ispunct);
3589 else if (strcmp (name, "xdigit") == 0)
3590 BUILD_CHARCLASS_LOOP (isxdigit);
3591 else
3592 return REG_ECTYPE;
3593
3594 return REG_NOERROR;
3595}
3596
3597static bin_tree_t *
3598build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3599 const unsigned char *class_name,
3600 const unsigned char *extra, bool non_match,
3601 reg_errcode_t *err)
3602{
3603 re_bitset_ptr_t sbcset;
3604#ifdef RE_ENABLE_I18N
3605 re_charset_t *mbcset;
3606 Idx alloc = 0;
3607#endif /* not RE_ENABLE_I18N */
3608 reg_errcode_t ret;
3609 re_token_t br_token;
3610 bin_tree_t *tree;
3611
3612 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3613#ifdef RE_ENABLE_I18N
3614 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3615#endif /* RE_ENABLE_I18N */
3616
3617#ifdef RE_ENABLE_I18N
3618 if (BE (sbcset == NULL || mbcset == NULL, 0))
3619#else /* not RE_ENABLE_I18N */
3620 if (BE (sbcset == NULL, 0))
3621#endif /* not RE_ENABLE_I18N */
3622 {
3623 *err = REG_ESPACE;
3624 return NULL;
3625 }
3626
3627 if (non_match)
3628 {
3629#ifdef RE_ENABLE_I18N
3630 mbcset->non_match = 1;
3631#endif /* not RE_ENABLE_I18N */
3632 }
3633
3634 /* We don't care the syntax in this case. */
3635 ret = build_charclass (trans, sbcset,
3636#ifdef RE_ENABLE_I18N
3637 mbcset, &alloc,
3638#endif /* RE_ENABLE_I18N */
3639 class_name, 0);
3640
3641 if (BE (ret != REG_NOERROR, 0))
3642 {
3643 re_free (sbcset);
3644#ifdef RE_ENABLE_I18N
3645 free_charset (mbcset);
3646#endif /* RE_ENABLE_I18N */
3647 *err = ret;
3648 return NULL;
3649 }
3650 /* \w match '_' also. */
3651 for (; *extra; extra++)
3652 bitset_set (sbcset, *extra);
3653
3654 /* If it is non-matching list. */
3655 if (non_match)
3656 bitset_not (sbcset);
3657
3658#ifdef RE_ENABLE_I18N
3659 /* Ensure only single byte characters are set. */
3660 if (dfa->mb_cur_max > 1)
3661 bitset_mask (sbcset, dfa->sb_char);
3662#endif
3663
3664 /* Build a tree for simple bracket. */
3665 br_token.type = SIMPLE_BRACKET;
3666 br_token.opr.sbcset = sbcset;
3667 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3668 if (BE (tree == NULL, 0))
3669 goto build_word_op_espace;
3670
3671#ifdef RE_ENABLE_I18N
3672 if (dfa->mb_cur_max > 1)
3673 {
3674 bin_tree_t *mbc_tree;
3675 /* Build a tree for complex bracket. */
3676 br_token.type = COMPLEX_BRACKET;
3677 br_token.opr.mbcset = mbcset;
3678 dfa->has_mb_node = 1;
3679 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3680 if (BE (mbc_tree == NULL, 0))
3681 goto build_word_op_espace;
3682 /* Then join them by ALT node. */
3683 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3684 if (BE (mbc_tree != NULL, 1))
3685 return tree;
3686 }
3687 else
3688 {
3689 free_charset (mbcset);
3690 return tree;
3691 }
3692#else /* not RE_ENABLE_I18N */
3693 return tree;
3694#endif /* not RE_ENABLE_I18N */
3695
3696 build_word_op_espace:
3697 re_free (sbcset);
3698#ifdef RE_ENABLE_I18N
3699 free_charset (mbcset);
3700#endif /* RE_ENABLE_I18N */
3701 *err = REG_ESPACE;
3702 return NULL;
3703}
3704
3705/* This is intended for the expressions like "a{1,3}".
3706 Fetch a number from 'input', and return the number.
3707 Return REG_MISSING if the number field is empty like "{,1}".
3708 Return REG_ERROR if an error occurred. */
3709
3710static Idx
3711fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3712{
3713 Idx num = REG_MISSING;
3714 unsigned char c;
3715 while (1)
3716 {
3717 fetch_token (token, input, syntax);
3718 c = token->opr.c;
3719 if (BE (token->type == END_OF_RE, 0))
3720 return REG_ERROR;
3721 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3722 break;
3723 num = ((token->type != CHARACTER || c < '0' || '9' < c
3724 || num == REG_ERROR)
3725 ? REG_ERROR
3726 : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3727 num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3728 }
3729 return num;
3730}
3731\f
3732#ifdef RE_ENABLE_I18N
3733static void
3734free_charset (re_charset_t *cset)
3735{
3736 re_free (cset->mbchars);
3737# ifdef _LIBC
3738 re_free (cset->coll_syms);
3739 re_free (cset->equiv_classes);
3740 re_free (cset->range_starts);
3741 re_free (cset->range_ends);
3742# endif
3743 re_free (cset->char_classes);
3744 re_free (cset);
3745}
3746#endif /* RE_ENABLE_I18N */
3747\f
3748/* Functions for binary tree operation. */
3749
3750/* Create a tree node. */
3751
3752static bin_tree_t *
3753create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3754 re_token_type_t type)
3755{
3756 re_token_t t;
3757 t.type = type;
3758 return create_token_tree (dfa, left, right, &t);
3759}
3760
3761static bin_tree_t *
3762create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3763 const re_token_t *token)
3764{
3765 bin_tree_t *tree;
3766 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3767 {
3768 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3769
3770 if (storage == NULL)
3771 return NULL;
3772 storage->next = dfa->str_tree_storage;
3773 dfa->str_tree_storage = storage;
3774 dfa->str_tree_storage_idx = 0;
3775 }
3776 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3777
3778 tree->parent = NULL;
3779 tree->left = left;
3780 tree->right = right;
3781 tree->token = *token;
3782 tree->token.duplicated = 0;
3783 tree->token.opt_subexp = 0;
3784 tree->first = NULL;
3785 tree->next = NULL;
3786 tree->node_idx = REG_MISSING;
3787
3788 if (left != NULL)
3789 left->parent = tree;
3790 if (right != NULL)
3791 right->parent = tree;
3792 return tree;
3793}
3794
3795/* Mark the tree SRC as an optional subexpression.
3796 To be called from preorder or postorder. */
3797
3798static reg_errcode_t
3799mark_opt_subexp (void *extra, bin_tree_t *node)
3800{
3801 Idx idx = (Idx) (long) extra;
3802 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3803 node->token.opt_subexp = 1;
3804
3805 return REG_NOERROR;
3806}
3807
3808/* Free the allocated memory inside NODE. */
3809
3810static void
3811free_token (re_token_t *node)
3812{
3813#ifdef RE_ENABLE_I18N
3814 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3815 free_charset (node->opr.mbcset);
3816 else
3817#endif /* RE_ENABLE_I18N */
3818 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3819 re_free (node->opr.sbcset);
3820}
3821
3822/* Worker function for tree walking. Free the allocated memory inside NODE
3823 and its children. */
3824
3825static reg_errcode_t
3826free_tree (void *extra, bin_tree_t *node)
3827{
3828 free_token (&node->token);
3829 return REG_NOERROR;
3830}
3831
3832
3833/* Duplicate the node SRC, and return new node. This is a preorder
3834 visit similar to the one implemented by the generic visitor, but
3835 we need more infrastructure to maintain two parallel trees --- so,
3836 it's easier to duplicate. */
3837
3838static bin_tree_t *
3839duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3840{
3841 const bin_tree_t *node;
3842 bin_tree_t *dup_root;
3843 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3844
3845 for (node = root; ; )
3846 {
3847 /* Create a new tree and link it back to the current parent. */
3848 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3849 if (*p_new == NULL)
3850 return NULL;
3851 (*p_new)->parent = dup_node;
3852 (*p_new)->token.duplicated = 1;
3853 dup_node = *p_new;
3854
3855 /* Go to the left node, or up and to the right. */
3856 if (node->left)
3857 {
3858 node = node->left;
3859 p_new = &dup_node->left;
3860 }
3861 else
3862 {
3863 const bin_tree_t *prev = NULL;
3864 while (node->right == prev || node->right == NULL)
3865 {
3866 prev = node;
3867 node = node->parent;
3868 dup_node = dup_node->parent;
3869 if (!node)
3870 return dup_root;
3871 }
3872 node = node->right;
3873 p_new = &dup_node->right;
3874 }
3875 }
3876}