add getlogin from gnulib
[bpt/guile.git] / lib / regcomp.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2013 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
24 char *fastmap);
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26 #ifdef RE_ENABLE_I18N
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 #ifdef RE_ENABLE_I18N
32 static void optimize_utf8 (re_dfa_t *dfa);
33 #endif
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 Idx node, bool root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 bool accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88 #ifdef RE_ENABLE_I18N
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 re_charset_t *mbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94 bitset_t sbcset,
95 re_charset_t *mbcset,
96 Idx *char_class_alloc,
97 const char *class_name,
98 reg_syntax_t syntax);
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 bitset_t sbcset,
104 const char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const char *class_name,
110 const char *extra,
111 bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122 \f
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
127
128 static const char __re_error_msgid[] =
129 {
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
132 "\0"
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
135 "\0"
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 "\0"
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 "\0"
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 "\0"
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 "\0"
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 "\0"
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 "\0"
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 "\0"
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 "\0"
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 "\0"
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 "\0"
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 "\0"
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 "\0"
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 "\0"
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 "\0"
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180 };
181
182 static const size_t __re_error_msgid_idx[] =
183 {
184 REG_NOERROR_IDX,
185 REG_NOMATCH_IDX,
186 REG_BADPAT_IDX,
187 REG_ECOLLATE_IDX,
188 REG_ECTYPE_IDX,
189 REG_EESCAPE_IDX,
190 REG_ESUBREG_IDX,
191 REG_EBRACK_IDX,
192 REG_EPAREN_IDX,
193 REG_EBRACE_IDX,
194 REG_BADBR_IDX,
195 REG_ERANGE_IDX,
196 REG_ESPACE_IDX,
197 REG_BADRPT_IDX,
198 REG_EEND_IDX,
199 REG_ESIZE_IDX,
200 REG_ERPAREN_IDX
201 };
202 \f
203 /* Entry points for GNU code. */
204
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
208
209 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
210 are set in BUFP on entry. */
211
212 #ifdef _LIBC
213 const char *
214 re_compile_pattern (pattern, length, bufp)
215 const char *pattern;
216 size_t length;
217 struct re_pattern_buffer *bufp;
218 #else /* size_t might promote */
219 const char *
220 re_compile_pattern (const char *pattern, size_t length,
221 struct re_pattern_buffer *bufp)
222 #endif
223 {
224 reg_errcode_t ret;
225
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
230
231 /* Match anchors at newline. */
232 bufp->newline_anchor = 1;
233
234 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
235
236 if (!ret)
237 return NULL;
238 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
239 }
240 #ifdef _LIBC
241 weak_alias (__re_compile_pattern, re_compile_pattern)
242 #endif
243
244 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247 /* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249 reg_syntax_t re_syntax_options;
250
251
252 /* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
255
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
258
259 reg_syntax_t
260 re_set_syntax (syntax)
261 reg_syntax_t syntax;
262 {
263 reg_syntax_t ret = re_syntax_options;
264
265 re_syntax_options = syntax;
266 return ret;
267 }
268 #ifdef _LIBC
269 weak_alias (__re_set_syntax, re_set_syntax)
270 #endif
271
272 int
273 re_compile_fastmap (bufp)
274 struct re_pattern_buffer *bufp;
275 {
276 re_dfa_t *dfa = bufp->buffer;
277 char *fastmap = bufp->fastmap;
278
279 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281 if (dfa->init_state != dfa->init_state_word)
282 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283 if (dfa->init_state != dfa->init_state_nl)
284 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285 if (dfa->init_state != dfa->init_state_begbuf)
286 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287 bufp->fastmap_accurate = 1;
288 return 0;
289 }
290 #ifdef _LIBC
291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
292 #endif
293
294 static inline void
295 __attribute ((always_inline))
296 re_set_fastmap (char *fastmap, bool icase, int ch)
297 {
298 fastmap[ch] = 1;
299 if (icase)
300 fastmap[tolower (ch)] = 1;
301 }
302
303 /* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
305
306 static void
307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
308 char *fastmap)
309 {
310 re_dfa_t *dfa = bufp->buffer;
311 Idx node_cnt;
312 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314 {
315 Idx node = init_state->nodes.elems[node_cnt];
316 re_token_type_t type = dfa->nodes[node].type;
317
318 if (type == CHARACTER)
319 {
320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321 #ifdef RE_ENABLE_I18N
322 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323 {
324 unsigned char buf[MB_LEN_MAX];
325 unsigned char *p;
326 wchar_t wc;
327 mbstate_t state;
328
329 p = buf;
330 *p++ = dfa->nodes[node].opr.c;
331 while (++node < dfa->nodes_len
332 && dfa->nodes[node].type == CHARACTER
333 && dfa->nodes[node].mb_partial)
334 *p++ = dfa->nodes[node].opr.c;
335 memset (&state, '\0', sizeof (state));
336 if (__mbrtowc (&wc, (const char *) buf, p - buf,
337 &state) == p - buf
338 && (__wcrtomb ((char *) buf, towlower (wc), &state)
339 != (size_t) -1))
340 re_set_fastmap (fastmap, false, buf[0]);
341 }
342 #endif
343 }
344 else if (type == SIMPLE_BRACKET)
345 {
346 int i, ch;
347 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
348 {
349 int j;
350 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 if (w & ((bitset_word_t) 1 << j))
353 re_set_fastmap (fastmap, icase, ch);
354 }
355 }
356 #ifdef RE_ENABLE_I18N
357 else if (type == COMPLEX_BRACKET)
358 {
359 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
360 Idx i;
361
362 # ifdef _LIBC
363 /* See if we have to try all bytes which start multiple collation
364 elements.
365 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
366 collation element, and don't catch 'b' since 'b' is
367 the only collation element which starts from 'b' (and
368 it is caught by SIMPLE_BRACKET). */
369 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
370 && (cset->ncoll_syms || cset->nranges))
371 {
372 const int32_t *table = (const int32_t *)
373 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
374 for (i = 0; i < SBC_MAX; ++i)
375 if (table[i] < 0)
376 re_set_fastmap (fastmap, icase, i);
377 }
378 # endif /* _LIBC */
379
380 /* See if we have to start the match at all multibyte characters,
381 i.e. where we would not find an invalid sequence. This only
382 applies to multibyte character sets; for single byte character
383 sets, the SIMPLE_BRACKET again suffices. */
384 if (dfa->mb_cur_max > 1
385 && (cset->nchar_classes || cset->non_match || cset->nranges
386 # ifdef _LIBC
387 || cset->nequiv_classes
388 # endif /* _LIBC */
389 ))
390 {
391 unsigned char c = 0;
392 do
393 {
394 mbstate_t mbs;
395 memset (&mbs, 0, sizeof (mbs));
396 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
397 re_set_fastmap (fastmap, false, (int) c);
398 }
399 while (++c != 0);
400 }
401
402 else
403 {
404 /* ... Else catch all bytes which can start the mbchars. */
405 for (i = 0; i < cset->nmbchars; ++i)
406 {
407 char buf[256];
408 mbstate_t state;
409 memset (&state, '\0', sizeof (state));
410 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
411 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
412 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
413 {
414 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
415 != (size_t) -1)
416 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
417 }
418 }
419 }
420 }
421 #endif /* RE_ENABLE_I18N */
422 else if (type == OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type == OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type == END_OF_RE)
427 {
428 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429 if (type == END_OF_RE)
430 bufp->can_be_null = 1;
431 return;
432 }
433 }
434 }
435 \f
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
438
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
441
442 'buffer' to the compiled pattern;
443 'used' to the length of the compiled pattern;
444 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 'fastmap' to an allocated space for the fastmap;
449 'fastmap_accurate' to zero;
450 're_nsub' to the number of subexpressions in PATTERN.
451
452 PATTERN is the address of the pattern string.
453
454 CFLAGS is a series of bits which affect compilation.
455
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
458
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
461
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
464
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
467 registers.
468
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
471
472 int
473 regcomp (preg, pattern, cflags)
474 regex_t *_Restrict_ preg;
475 const char *_Restrict_ pattern;
476 int cflags;
477 {
478 reg_errcode_t ret;
479 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC);
481
482 preg->buffer = NULL;
483 preg->allocated = 0;
484 preg->used = 0;
485
486 /* Try to allocate space for the fastmap. */
487 preg->fastmap = re_malloc (char, SBC_MAX);
488 if (BE (preg->fastmap == NULL, 0))
489 return REG_ESPACE;
490
491 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
492
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags & REG_NEWLINE)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax &= ~RE_DOT_NEWLINE;
497 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498 /* It also changes the matching behavior. */
499 preg->newline_anchor = 1;
500 }
501 else
502 preg->newline_anchor = 0;
503 preg->no_sub = !!(cflags & REG_NOSUB);
504 preg->translate = NULL;
505
506 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
507
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret == REG_ERPAREN)
511 ret = REG_EPAREN;
512
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret == REG_NOERROR, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg);
518 else
519 {
520 /* Some error occurred while compiling the expression. */
521 re_free (preg->fastmap);
522 preg->fastmap = NULL;
523 }
524
525 return (int) ret;
526 }
527 #ifdef _LIBC
528 weak_alias (__regcomp, regcomp)
529 #endif
530
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
533
534 #ifdef _LIBC
535 size_t
536 regerror (errcode, preg, errbuf, errbuf_size)
537 int errcode;
538 const regex_t *_Restrict_ preg;
539 char *_Restrict_ errbuf;
540 size_t errbuf_size;
541 #else /* size_t might promote */
542 size_t
543 regerror (int errcode, const regex_t *_Restrict_ preg,
544 char *_Restrict_ errbuf, size_t errbuf_size)
545 #endif
546 {
547 const char *msg;
548 size_t msg_size;
549
550 if (BE (errcode < 0
551 || errcode >= (int) (sizeof (__re_error_msgid_idx)
552 / sizeof (__re_error_msgid_idx[0])), 0))
553 /* Only error codes returned by the rest of the code should be passed
554 to this routine. If we are given anything else, or if other regex
555 code generates an invalid error code, then the program has a bug.
556 Dump core so we can fix it. */
557 abort ();
558
559 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
560
561 msg_size = strlen (msg) + 1; /* Includes the null. */
562
563 if (BE (errbuf_size != 0, 1))
564 {
565 size_t cpy_size = msg_size;
566 if (BE (msg_size > errbuf_size, 0))
567 {
568 cpy_size = errbuf_size - 1;
569 errbuf[cpy_size] = '\0';
570 }
571 memcpy (errbuf, msg, cpy_size);
572 }
573
574 return msg_size;
575 }
576 #ifdef _LIBC
577 weak_alias (__regerror, regerror)
578 #endif
579
580
581 #ifdef RE_ENABLE_I18N
582 /* This static array is used for the map to single-byte characters when
583 UTF-8 is used. Otherwise we would allocate memory just to initialize
584 it the same all the time. UTF-8 is the preferred encoding so this is
585 a worthwhile optimization. */
586 static const bitset_t utf8_sb_map =
587 {
588 /* Set the first 128 bits. */
589 # ifdef __GNUC__
590 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
591 # else
592 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
593 # error "bitset_word_t is narrower than 32 bits"
594 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
595 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
596 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
597 BITSET_WORD_MAX, BITSET_WORD_MAX,
598 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
599 BITSET_WORD_MAX,
600 # endif
601 (BITSET_WORD_MAX
602 >> (SBC_MAX % BITSET_WORD_BITS == 0
603 ? 0
604 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
605 # endif
606 };
607 #endif
608
609
610 static void
611 free_dfa_content (re_dfa_t *dfa)
612 {
613 Idx i, j;
614
615 if (dfa->nodes)
616 for (i = 0; i < dfa->nodes_len; ++i)
617 free_token (dfa->nodes + i);
618 re_free (dfa->nexts);
619 for (i = 0; i < dfa->nodes_len; ++i)
620 {
621 if (dfa->eclosures != NULL)
622 re_node_set_free (dfa->eclosures + i);
623 if (dfa->inveclosures != NULL)
624 re_node_set_free (dfa->inveclosures + i);
625 if (dfa->edests != NULL)
626 re_node_set_free (dfa->edests + i);
627 }
628 re_free (dfa->edests);
629 re_free (dfa->eclosures);
630 re_free (dfa->inveclosures);
631 re_free (dfa->nodes);
632
633 if (dfa->state_table)
634 for (i = 0; i <= dfa->state_hash_mask; ++i)
635 {
636 struct re_state_table_entry *entry = dfa->state_table + i;
637 for (j = 0; j < entry->num; ++j)
638 {
639 re_dfastate_t *state = entry->array[j];
640 free_state (state);
641 }
642 re_free (entry->array);
643 }
644 re_free (dfa->state_table);
645 #ifdef RE_ENABLE_I18N
646 if (dfa->sb_char != utf8_sb_map)
647 re_free (dfa->sb_char);
648 #endif
649 re_free (dfa->subexp_map);
650 #ifdef DEBUG
651 re_free (dfa->re_str);
652 #endif
653
654 re_free (dfa);
655 }
656
657
658 /* Free dynamically allocated space used by PREG. */
659
660 void
661 regfree (preg)
662 regex_t *preg;
663 {
664 re_dfa_t *dfa = preg->buffer;
665 if (BE (dfa != NULL, 1))
666 free_dfa_content (dfa);
667 preg->buffer = NULL;
668 preg->allocated = 0;
669
670 re_free (preg->fastmap);
671 preg->fastmap = NULL;
672
673 re_free (preg->translate);
674 preg->translate = NULL;
675 }
676 #ifdef _LIBC
677 weak_alias (__regfree, regfree)
678 #endif
679 \f
680 /* Entry points compatible with 4.2 BSD regex library. We don't define
681 them unless specifically requested. */
682
683 #if defined _REGEX_RE_COMP || defined _LIBC
684
685 /* BSD has one and only one pattern buffer. */
686 static struct re_pattern_buffer re_comp_buf;
687
688 char *
689 # ifdef _LIBC
690 /* Make these definitions weak in libc, so POSIX programs can redefine
691 these names if they don't use our functions, and still use
692 regcomp/regexec above without link errors. */
693 weak_function
694 # endif
695 re_comp (s)
696 const char *s;
697 {
698 reg_errcode_t ret;
699 char *fastmap;
700
701 if (!s)
702 {
703 if (!re_comp_buf.buffer)
704 return gettext ("No previous regular expression");
705 return 0;
706 }
707
708 if (re_comp_buf.buffer)
709 {
710 fastmap = re_comp_buf.fastmap;
711 re_comp_buf.fastmap = NULL;
712 __regfree (&re_comp_buf);
713 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
714 re_comp_buf.fastmap = fastmap;
715 }
716
717 if (re_comp_buf.fastmap == NULL)
718 {
719 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
720 if (re_comp_buf.fastmap == NULL)
721 return (char *) gettext (__re_error_msgid
722 + __re_error_msgid_idx[(int) REG_ESPACE]);
723 }
724
725 /* Since 're_exec' always passes NULL for the 'regs' argument, we
726 don't need to initialize the pattern buffer fields which affect it. */
727
728 /* Match anchors at newlines. */
729 re_comp_buf.newline_anchor = 1;
730
731 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
732
733 if (!ret)
734 return NULL;
735
736 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
737 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
738 }
739
740 #ifdef _LIBC
741 libc_freeres_fn (free_mem)
742 {
743 __regfree (&re_comp_buf);
744 }
745 #endif
746
747 #endif /* _REGEX_RE_COMP */
748 \f
749 /* Internal entry point.
750 Compile the regular expression PATTERN, whose length is LENGTH.
751 SYNTAX indicate regular expression's syntax. */
752
753 static reg_errcode_t
754 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
755 reg_syntax_t syntax)
756 {
757 reg_errcode_t err = REG_NOERROR;
758 re_dfa_t *dfa;
759 re_string_t regexp;
760
761 /* Initialize the pattern buffer. */
762 preg->fastmap_accurate = 0;
763 preg->syntax = syntax;
764 preg->not_bol = preg->not_eol = 0;
765 preg->used = 0;
766 preg->re_nsub = 0;
767 preg->can_be_null = 0;
768 preg->regs_allocated = REGS_UNALLOCATED;
769
770 /* Initialize the dfa. */
771 dfa = preg->buffer;
772 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
773 {
774 /* If zero allocated, but buffer is non-null, try to realloc
775 enough space. This loses if buffer's address is bogus, but
776 that is the user's responsibility. If ->buffer is NULL this
777 is a simple allocation. */
778 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
779 if (dfa == NULL)
780 return REG_ESPACE;
781 preg->allocated = sizeof (re_dfa_t);
782 preg->buffer = dfa;
783 }
784 preg->used = sizeof (re_dfa_t);
785
786 err = init_dfa (dfa, length);
787 if (BE (err != REG_NOERROR, 0))
788 {
789 free_dfa_content (dfa);
790 preg->buffer = NULL;
791 preg->allocated = 0;
792 return err;
793 }
794 #ifdef DEBUG
795 /* Note: length+1 will not overflow since it is checked in init_dfa. */
796 dfa->re_str = re_malloc (char, length + 1);
797 strncpy (dfa->re_str, pattern, length + 1);
798 #endif
799
800 __libc_lock_init (dfa->lock);
801
802 err = re_string_construct (&regexp, pattern, length, preg->translate,
803 (syntax & RE_ICASE) != 0, dfa);
804 if (BE (err != REG_NOERROR, 0))
805 {
806 re_compile_internal_free_return:
807 free_workarea_compile (preg);
808 re_string_destruct (&regexp);
809 free_dfa_content (dfa);
810 preg->buffer = NULL;
811 preg->allocated = 0;
812 return err;
813 }
814
815 /* Parse the regular expression, and build a structure tree. */
816 preg->re_nsub = 0;
817 dfa->str_tree = parse (&regexp, preg, syntax, &err);
818 if (BE (dfa->str_tree == NULL, 0))
819 goto re_compile_internal_free_return;
820
821 /* Analyze the tree and create the nfa. */
822 err = analyze (preg);
823 if (BE (err != REG_NOERROR, 0))
824 goto re_compile_internal_free_return;
825
826 #ifdef RE_ENABLE_I18N
827 /* If possible, do searching in single byte encoding to speed things up. */
828 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
829 optimize_utf8 (dfa);
830 #endif
831
832 /* Then create the initial state of the dfa. */
833 err = create_initial_state (dfa);
834
835 /* Release work areas. */
836 free_workarea_compile (preg);
837 re_string_destruct (&regexp);
838
839 if (BE (err != REG_NOERROR, 0))
840 {
841 free_dfa_content (dfa);
842 preg->buffer = NULL;
843 preg->allocated = 0;
844 }
845
846 return err;
847 }
848
849 /* Initialize DFA. We use the length of the regular expression PAT_LEN
850 as the initial length of some arrays. */
851
852 static reg_errcode_t
853 init_dfa (re_dfa_t *dfa, size_t pat_len)
854 {
855 __re_size_t table_size;
856 #ifndef _LIBC
857 const char *codeset_name;
858 #endif
859 #ifdef RE_ENABLE_I18N
860 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
861 #else
862 size_t max_i18n_object_size = 0;
863 #endif
864 size_t max_object_size =
865 MAX (sizeof (struct re_state_table_entry),
866 MAX (sizeof (re_token_t),
867 MAX (sizeof (re_node_set),
868 MAX (sizeof (regmatch_t),
869 max_i18n_object_size))));
870
871 memset (dfa, '\0', sizeof (re_dfa_t));
872
873 /* Force allocation of str_tree_storage the first time. */
874 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
875
876 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
877 calculation below, and for similar doubling calculations
878 elsewhere. And it's <= rather than <, because some of the
879 doubling calculations add 1 afterwards. */
880 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
881 return REG_ESPACE;
882
883 dfa->nodes_alloc = pat_len + 1;
884 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
885
886 /* table_size = 2 ^ ceil(log pat_len) */
887 for (table_size = 1; ; table_size <<= 1)
888 if (table_size > pat_len)
889 break;
890
891 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
892 dfa->state_hash_mask = table_size - 1;
893
894 dfa->mb_cur_max = MB_CUR_MAX;
895 #ifdef _LIBC
896 if (dfa->mb_cur_max == 6
897 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
898 dfa->is_utf8 = 1;
899 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
900 != 0);
901 #else
902 codeset_name = nl_langinfo (CODESET);
903 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
904 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
905 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
906 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
907 dfa->is_utf8 = 1;
908
909 /* We check exhaustively in the loop below if this charset is a
910 superset of ASCII. */
911 dfa->map_notascii = 0;
912 #endif
913
914 #ifdef RE_ENABLE_I18N
915 if (dfa->mb_cur_max > 1)
916 {
917 if (dfa->is_utf8)
918 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
919 else
920 {
921 int i, j, ch;
922
923 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
924 if (BE (dfa->sb_char == NULL, 0))
925 return REG_ESPACE;
926
927 /* Set the bits corresponding to single byte chars. */
928 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
929 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
930 {
931 wint_t wch = __btowc (ch);
932 if (wch != WEOF)
933 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
934 # ifndef _LIBC
935 if (isascii (ch) && wch != ch)
936 dfa->map_notascii = 1;
937 # endif
938 }
939 }
940 }
941 #endif
942
943 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
944 return REG_ESPACE;
945 return REG_NOERROR;
946 }
947
948 /* Initialize WORD_CHAR table, which indicate which character is
949 "word". In this case "word" means that it is the word construction
950 character used by some operators like "\<", "\>", etc. */
951
952 static void
953 internal_function
954 init_word_char (re_dfa_t *dfa)
955 {
956 int i = 0;
957 int j;
958 int ch = 0;
959 dfa->word_ops_used = 1;
960 if (BE (dfa->map_notascii == 0, 1))
961 {
962 bitset_word_t bits0 = 0x00000000;
963 bitset_word_t bits1 = 0x03ff0000;
964 bitset_word_t bits2 = 0x87fffffe;
965 bitset_word_t bits3 = 0x07fffffe;
966 if (BITSET_WORD_BITS == 64)
967 {
968 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
969 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
970 i = 2;
971 }
972 else if (BITSET_WORD_BITS == 32)
973 {
974 dfa->word_char[0] = bits0;
975 dfa->word_char[1] = bits1;
976 dfa->word_char[2] = bits2;
977 dfa->word_char[3] = bits3;
978 i = 4;
979 }
980 else
981 goto general_case;
982 ch = 128;
983
984 if (BE (dfa->is_utf8, 1))
985 {
986 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
987 return;
988 }
989 }
990
991 general_case:
992 for (; i < BITSET_WORDS; ++i)
993 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
994 if (isalnum (ch) || ch == '_')
995 dfa->word_char[i] |= (bitset_word_t) 1 << j;
996 }
997
998 /* Free the work area which are only used while compiling. */
999
1000 static void
1001 free_workarea_compile (regex_t *preg)
1002 {
1003 re_dfa_t *dfa = preg->buffer;
1004 bin_tree_storage_t *storage, *next;
1005 for (storage = dfa->str_tree_storage; storage; storage = next)
1006 {
1007 next = storage->next;
1008 re_free (storage);
1009 }
1010 dfa->str_tree_storage = NULL;
1011 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1012 dfa->str_tree = NULL;
1013 re_free (dfa->org_indices);
1014 dfa->org_indices = NULL;
1015 }
1016
1017 /* Create initial states for all contexts. */
1018
1019 static reg_errcode_t
1020 create_initial_state (re_dfa_t *dfa)
1021 {
1022 Idx first, i;
1023 reg_errcode_t err;
1024 re_node_set init_nodes;
1025
1026 /* Initial states have the epsilon closure of the node which is
1027 the first node of the regular expression. */
1028 first = dfa->str_tree->first->node_idx;
1029 dfa->init_node = first;
1030 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1031 if (BE (err != REG_NOERROR, 0))
1032 return err;
1033
1034 /* The back-references which are in initial states can epsilon transit,
1035 since in this case all of the subexpressions can be null.
1036 Then we add epsilon closures of the nodes which are the next nodes of
1037 the back-references. */
1038 if (dfa->nbackref > 0)
1039 for (i = 0; i < init_nodes.nelem; ++i)
1040 {
1041 Idx node_idx = init_nodes.elems[i];
1042 re_token_type_t type = dfa->nodes[node_idx].type;
1043
1044 Idx clexp_idx;
1045 if (type != OP_BACK_REF)
1046 continue;
1047 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1048 {
1049 re_token_t *clexp_node;
1050 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1051 if (clexp_node->type == OP_CLOSE_SUBEXP
1052 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1053 break;
1054 }
1055 if (clexp_idx == init_nodes.nelem)
1056 continue;
1057
1058 if (type == OP_BACK_REF)
1059 {
1060 Idx dest_idx = dfa->edests[node_idx].elems[0];
1061 if (!re_node_set_contains (&init_nodes, dest_idx))
1062 {
1063 reg_errcode_t merge_err
1064 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1065 if (merge_err != REG_NOERROR)
1066 return merge_err;
1067 i = 0;
1068 }
1069 }
1070 }
1071
1072 /* It must be the first time to invoke acquire_state. */
1073 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1074 /* We don't check ERR here, since the initial state must not be NULL. */
1075 if (BE (dfa->init_state == NULL, 0))
1076 return err;
1077 if (dfa->init_state->has_constraint)
1078 {
1079 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1080 CONTEXT_WORD);
1081 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1082 CONTEXT_NEWLINE);
1083 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1084 &init_nodes,
1085 CONTEXT_NEWLINE
1086 | CONTEXT_BEGBUF);
1087 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1088 || dfa->init_state_begbuf == NULL, 0))
1089 return err;
1090 }
1091 else
1092 dfa->init_state_word = dfa->init_state_nl
1093 = dfa->init_state_begbuf = dfa->init_state;
1094
1095 re_node_set_free (&init_nodes);
1096 return REG_NOERROR;
1097 }
1098 \f
1099 #ifdef RE_ENABLE_I18N
1100 /* If it is possible to do searching in single byte encoding instead of UTF-8
1101 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1102 DFA nodes where needed. */
1103
1104 static void
1105 optimize_utf8 (re_dfa_t *dfa)
1106 {
1107 Idx node;
1108 int i;
1109 bool mb_chars = false;
1110 bool has_period = false;
1111
1112 for (node = 0; node < dfa->nodes_len; ++node)
1113 switch (dfa->nodes[node].type)
1114 {
1115 case CHARACTER:
1116 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1117 mb_chars = true;
1118 break;
1119 case ANCHOR:
1120 switch (dfa->nodes[node].opr.ctx_type)
1121 {
1122 case LINE_FIRST:
1123 case LINE_LAST:
1124 case BUF_FIRST:
1125 case BUF_LAST:
1126 break;
1127 default:
1128 /* Word anchors etc. cannot be handled. It's okay to test
1129 opr.ctx_type since constraints (for all DFA nodes) are
1130 created by ORing one or more opr.ctx_type values. */
1131 return;
1132 }
1133 break;
1134 case OP_PERIOD:
1135 has_period = true;
1136 break;
1137 case OP_BACK_REF:
1138 case OP_ALT:
1139 case END_OF_RE:
1140 case OP_DUP_ASTERISK:
1141 case OP_OPEN_SUBEXP:
1142 case OP_CLOSE_SUBEXP:
1143 break;
1144 case COMPLEX_BRACKET:
1145 return;
1146 case SIMPLE_BRACKET:
1147 /* Just double check. */
1148 {
1149 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1150 ? 0
1151 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1152 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1153 {
1154 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1155 return;
1156 rshift = 0;
1157 }
1158 }
1159 break;
1160 default:
1161 abort ();
1162 }
1163
1164 if (mb_chars || has_period)
1165 for (node = 0; node < dfa->nodes_len; ++node)
1166 {
1167 if (dfa->nodes[node].type == CHARACTER
1168 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1169 dfa->nodes[node].mb_partial = 0;
1170 else if (dfa->nodes[node].type == OP_PERIOD)
1171 dfa->nodes[node].type = OP_UTF8_PERIOD;
1172 }
1173
1174 /* The search can be in single byte locale. */
1175 dfa->mb_cur_max = 1;
1176 dfa->is_utf8 = 0;
1177 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1178 }
1179 #endif
1180 \f
1181 /* Analyze the structure tree, and calculate "first", "next", "edest",
1182 "eclosure", and "inveclosure". */
1183
1184 static reg_errcode_t
1185 analyze (regex_t *preg)
1186 {
1187 re_dfa_t *dfa = preg->buffer;
1188 reg_errcode_t ret;
1189
1190 /* Allocate arrays. */
1191 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1192 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1193 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1194 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1195 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1196 || dfa->eclosures == NULL, 0))
1197 return REG_ESPACE;
1198
1199 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1200 if (dfa->subexp_map != NULL)
1201 {
1202 Idx i;
1203 for (i = 0; i < preg->re_nsub; i++)
1204 dfa->subexp_map[i] = i;
1205 preorder (dfa->str_tree, optimize_subexps, dfa);
1206 for (i = 0; i < preg->re_nsub; i++)
1207 if (dfa->subexp_map[i] != i)
1208 break;
1209 if (i == preg->re_nsub)
1210 {
1211 free (dfa->subexp_map);
1212 dfa->subexp_map = NULL;
1213 }
1214 }
1215
1216 ret = postorder (dfa->str_tree, lower_subexps, preg);
1217 if (BE (ret != REG_NOERROR, 0))
1218 return ret;
1219 ret = postorder (dfa->str_tree, calc_first, dfa);
1220 if (BE (ret != REG_NOERROR, 0))
1221 return ret;
1222 preorder (dfa->str_tree, calc_next, dfa);
1223 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1224 if (BE (ret != REG_NOERROR, 0))
1225 return ret;
1226 ret = calc_eclosure (dfa);
1227 if (BE (ret != REG_NOERROR, 0))
1228 return ret;
1229
1230 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1231 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1232 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1233 || dfa->nbackref)
1234 {
1235 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1236 if (BE (dfa->inveclosures == NULL, 0))
1237 return REG_ESPACE;
1238 ret = calc_inveclosure (dfa);
1239 }
1240
1241 return ret;
1242 }
1243
1244 /* Our parse trees are very unbalanced, so we cannot use a stack to
1245 implement parse tree visits. Instead, we use parent pointers and
1246 some hairy code in these two functions. */
1247 static reg_errcode_t
1248 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1249 void *extra)
1250 {
1251 bin_tree_t *node, *prev;
1252
1253 for (node = root; ; )
1254 {
1255 /* Descend down the tree, preferably to the left (or to the right
1256 if that's the only child). */
1257 while (node->left || node->right)
1258 if (node->left)
1259 node = node->left;
1260 else
1261 node = node->right;
1262
1263 do
1264 {
1265 reg_errcode_t err = fn (extra, node);
1266 if (BE (err != REG_NOERROR, 0))
1267 return err;
1268 if (node->parent == NULL)
1269 return REG_NOERROR;
1270 prev = node;
1271 node = node->parent;
1272 }
1273 /* Go up while we have a node that is reached from the right. */
1274 while (node->right == prev || node->right == NULL);
1275 node = node->right;
1276 }
1277 }
1278
1279 static reg_errcode_t
1280 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1281 void *extra)
1282 {
1283 bin_tree_t *node;
1284
1285 for (node = root; ; )
1286 {
1287 reg_errcode_t err = fn (extra, node);
1288 if (BE (err != REG_NOERROR, 0))
1289 return err;
1290
1291 /* Go to the left node, or up and to the right. */
1292 if (node->left)
1293 node = node->left;
1294 else
1295 {
1296 bin_tree_t *prev = NULL;
1297 while (node->right == prev || node->right == NULL)
1298 {
1299 prev = node;
1300 node = node->parent;
1301 if (!node)
1302 return REG_NOERROR;
1303 }
1304 node = node->right;
1305 }
1306 }
1307 }
1308
1309 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1310 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1311 backreferences as well. Requires a preorder visit. */
1312 static reg_errcode_t
1313 optimize_subexps (void *extra, bin_tree_t *node)
1314 {
1315 re_dfa_t *dfa = (re_dfa_t *) extra;
1316
1317 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1318 {
1319 int idx = node->token.opr.idx;
1320 node->token.opr.idx = dfa->subexp_map[idx];
1321 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1322 }
1323
1324 else if (node->token.type == SUBEXP
1325 && node->left && node->left->token.type == SUBEXP)
1326 {
1327 Idx other_idx = node->left->token.opr.idx;
1328
1329 node->left = node->left->left;
1330 if (node->left)
1331 node->left->parent = node;
1332
1333 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1334 if (other_idx < BITSET_WORD_BITS)
1335 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1336 }
1337
1338 return REG_NOERROR;
1339 }
1340
1341 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1342 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1343 static reg_errcode_t
1344 lower_subexps (void *extra, bin_tree_t *node)
1345 {
1346 regex_t *preg = (regex_t *) extra;
1347 reg_errcode_t err = REG_NOERROR;
1348
1349 if (node->left && node->left->token.type == SUBEXP)
1350 {
1351 node->left = lower_subexp (&err, preg, node->left);
1352 if (node->left)
1353 node->left->parent = node;
1354 }
1355 if (node->right && node->right->token.type == SUBEXP)
1356 {
1357 node->right = lower_subexp (&err, preg, node->right);
1358 if (node->right)
1359 node->right->parent = node;
1360 }
1361
1362 return err;
1363 }
1364
1365 static bin_tree_t *
1366 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1367 {
1368 re_dfa_t *dfa = preg->buffer;
1369 bin_tree_t *body = node->left;
1370 bin_tree_t *op, *cls, *tree1, *tree;
1371
1372 if (preg->no_sub
1373 /* We do not optimize empty subexpressions, because otherwise we may
1374 have bad CONCAT nodes with NULL children. This is obviously not
1375 very common, so we do not lose much. An example that triggers
1376 this case is the sed "script" /\(\)/x. */
1377 && node->left != NULL
1378 && (node->token.opr.idx >= BITSET_WORD_BITS
1379 || !(dfa->used_bkref_map
1380 & ((bitset_word_t) 1 << node->token.opr.idx))))
1381 return node->left;
1382
1383 /* Convert the SUBEXP node to the concatenation of an
1384 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1385 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1386 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1387 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1388 tree = create_tree (dfa, op, tree1, CONCAT);
1389 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1390 {
1391 *err = REG_ESPACE;
1392 return NULL;
1393 }
1394
1395 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1396 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1397 return tree;
1398 }
1399
1400 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1401 nodes. Requires a postorder visit. */
1402 static reg_errcode_t
1403 calc_first (void *extra, bin_tree_t *node)
1404 {
1405 re_dfa_t *dfa = (re_dfa_t *) extra;
1406 if (node->token.type == CONCAT)
1407 {
1408 node->first = node->left->first;
1409 node->node_idx = node->left->node_idx;
1410 }
1411 else
1412 {
1413 node->first = node;
1414 node->node_idx = re_dfa_add_node (dfa, node->token);
1415 if (BE (node->node_idx == REG_MISSING, 0))
1416 return REG_ESPACE;
1417 if (node->token.type == ANCHOR)
1418 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1419 }
1420 return REG_NOERROR;
1421 }
1422
1423 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1424 static reg_errcode_t
1425 calc_next (void *extra, bin_tree_t *node)
1426 {
1427 switch (node->token.type)
1428 {
1429 case OP_DUP_ASTERISK:
1430 node->left->next = node;
1431 break;
1432 case CONCAT:
1433 node->left->next = node->right->first;
1434 node->right->next = node->next;
1435 break;
1436 default:
1437 if (node->left)
1438 node->left->next = node->next;
1439 if (node->right)
1440 node->right->next = node->next;
1441 break;
1442 }
1443 return REG_NOERROR;
1444 }
1445
1446 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1447 static reg_errcode_t
1448 link_nfa_nodes (void *extra, bin_tree_t *node)
1449 {
1450 re_dfa_t *dfa = (re_dfa_t *) extra;
1451 Idx idx = node->node_idx;
1452 reg_errcode_t err = REG_NOERROR;
1453
1454 switch (node->token.type)
1455 {
1456 case CONCAT:
1457 break;
1458
1459 case END_OF_RE:
1460 assert (node->next == NULL);
1461 break;
1462
1463 case OP_DUP_ASTERISK:
1464 case OP_ALT:
1465 {
1466 Idx left, right;
1467 dfa->has_plural_match = 1;
1468 if (node->left != NULL)
1469 left = node->left->first->node_idx;
1470 else
1471 left = node->next->node_idx;
1472 if (node->right != NULL)
1473 right = node->right->first->node_idx;
1474 else
1475 right = node->next->node_idx;
1476 assert (REG_VALID_INDEX (left));
1477 assert (REG_VALID_INDEX (right));
1478 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1479 }
1480 break;
1481
1482 case ANCHOR:
1483 case OP_OPEN_SUBEXP:
1484 case OP_CLOSE_SUBEXP:
1485 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1486 break;
1487
1488 case OP_BACK_REF:
1489 dfa->nexts[idx] = node->next->node_idx;
1490 if (node->token.type == OP_BACK_REF)
1491 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1492 break;
1493
1494 default:
1495 assert (!IS_EPSILON_NODE (node->token.type));
1496 dfa->nexts[idx] = node->next->node_idx;
1497 break;
1498 }
1499
1500 return err;
1501 }
1502
1503 /* Duplicate the epsilon closure of the node ROOT_NODE.
1504 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1505 to their own constraint. */
1506
1507 static reg_errcode_t
1508 internal_function
1509 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1510 Idx root_node, unsigned int init_constraint)
1511 {
1512 Idx org_node, clone_node;
1513 bool ok;
1514 unsigned int constraint = init_constraint;
1515 for (org_node = top_org_node, clone_node = top_clone_node;;)
1516 {
1517 Idx org_dest, clone_dest;
1518 if (dfa->nodes[org_node].type == OP_BACK_REF)
1519 {
1520 /* If the back reference epsilon-transit, its destination must
1521 also have the constraint. Then duplicate the epsilon closure
1522 of the destination of the back reference, and store it in
1523 edests of the back reference. */
1524 org_dest = dfa->nexts[org_node];
1525 re_node_set_empty (dfa->edests + clone_node);
1526 clone_dest = duplicate_node (dfa, org_dest, constraint);
1527 if (BE (clone_dest == REG_MISSING, 0))
1528 return REG_ESPACE;
1529 dfa->nexts[clone_node] = dfa->nexts[org_node];
1530 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1531 if (BE (! ok, 0))
1532 return REG_ESPACE;
1533 }
1534 else if (dfa->edests[org_node].nelem == 0)
1535 {
1536 /* In case of the node can't epsilon-transit, don't duplicate the
1537 destination and store the original destination as the
1538 destination of the node. */
1539 dfa->nexts[clone_node] = dfa->nexts[org_node];
1540 break;
1541 }
1542 else if (dfa->edests[org_node].nelem == 1)
1543 {
1544 /* In case of the node can epsilon-transit, and it has only one
1545 destination. */
1546 org_dest = dfa->edests[org_node].elems[0];
1547 re_node_set_empty (dfa->edests + clone_node);
1548 /* If the node is root_node itself, it means the epsilon closure
1549 has a loop. Then tie it to the destination of the root_node. */
1550 if (org_node == root_node && clone_node != org_node)
1551 {
1552 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1553 if (BE (! ok, 0))
1554 return REG_ESPACE;
1555 break;
1556 }
1557 /* In case the node has another constraint, append it. */
1558 constraint |= dfa->nodes[org_node].constraint;
1559 clone_dest = duplicate_node (dfa, org_dest, constraint);
1560 if (BE (clone_dest == REG_MISSING, 0))
1561 return REG_ESPACE;
1562 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563 if (BE (! ok, 0))
1564 return REG_ESPACE;
1565 }
1566 else /* dfa->edests[org_node].nelem == 2 */
1567 {
1568 /* In case of the node can epsilon-transit, and it has two
1569 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1570 org_dest = dfa->edests[org_node].elems[0];
1571 re_node_set_empty (dfa->edests + clone_node);
1572 /* Search for a duplicated node which satisfies the constraint. */
1573 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1574 if (clone_dest == REG_MISSING)
1575 {
1576 /* There is no such duplicated node, create a new one. */
1577 reg_errcode_t err;
1578 clone_dest = duplicate_node (dfa, org_dest, constraint);
1579 if (BE (clone_dest == REG_MISSING, 0))
1580 return REG_ESPACE;
1581 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1582 if (BE (! ok, 0))
1583 return REG_ESPACE;
1584 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1585 root_node, constraint);
1586 if (BE (err != REG_NOERROR, 0))
1587 return err;
1588 }
1589 else
1590 {
1591 /* There is a duplicated node which satisfies the constraint,
1592 use it to avoid infinite loop. */
1593 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1594 if (BE (! ok, 0))
1595 return REG_ESPACE;
1596 }
1597
1598 org_dest = dfa->edests[org_node].elems[1];
1599 clone_dest = duplicate_node (dfa, org_dest, constraint);
1600 if (BE (clone_dest == REG_MISSING, 0))
1601 return REG_ESPACE;
1602 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1603 if (BE (! ok, 0))
1604 return REG_ESPACE;
1605 }
1606 org_node = org_dest;
1607 clone_node = clone_dest;
1608 }
1609 return REG_NOERROR;
1610 }
1611
1612 /* Search for a node which is duplicated from the node ORG_NODE, and
1613 satisfies the constraint CONSTRAINT. */
1614
1615 static Idx
1616 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1617 unsigned int constraint)
1618 {
1619 Idx idx;
1620 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1621 {
1622 if (org_node == dfa->org_indices[idx]
1623 && constraint == dfa->nodes[idx].constraint)
1624 return idx; /* Found. */
1625 }
1626 return REG_MISSING; /* Not found. */
1627 }
1628
1629 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1630 Return the index of the new node, or REG_MISSING if insufficient storage is
1631 available. */
1632
1633 static Idx
1634 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1635 {
1636 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1637 if (BE (dup_idx != REG_MISSING, 1))
1638 {
1639 dfa->nodes[dup_idx].constraint = constraint;
1640 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1641 dfa->nodes[dup_idx].duplicated = 1;
1642
1643 /* Store the index of the original node. */
1644 dfa->org_indices[dup_idx] = org_idx;
1645 }
1646 return dup_idx;
1647 }
1648
1649 static reg_errcode_t
1650 calc_inveclosure (re_dfa_t *dfa)
1651 {
1652 Idx src, idx;
1653 bool ok;
1654 for (idx = 0; idx < dfa->nodes_len; ++idx)
1655 re_node_set_init_empty (dfa->inveclosures + idx);
1656
1657 for (src = 0; src < dfa->nodes_len; ++src)
1658 {
1659 Idx *elems = dfa->eclosures[src].elems;
1660 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1661 {
1662 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1663 if (BE (! ok, 0))
1664 return REG_ESPACE;
1665 }
1666 }
1667
1668 return REG_NOERROR;
1669 }
1670
1671 /* Calculate "eclosure" for all the node in DFA. */
1672
1673 static reg_errcode_t
1674 calc_eclosure (re_dfa_t *dfa)
1675 {
1676 Idx node_idx;
1677 bool incomplete;
1678 #ifdef DEBUG
1679 assert (dfa->nodes_len > 0);
1680 #endif
1681 incomplete = false;
1682 /* For each nodes, calculate epsilon closure. */
1683 for (node_idx = 0; ; ++node_idx)
1684 {
1685 reg_errcode_t err;
1686 re_node_set eclosure_elem;
1687 if (node_idx == dfa->nodes_len)
1688 {
1689 if (!incomplete)
1690 break;
1691 incomplete = false;
1692 node_idx = 0;
1693 }
1694
1695 #ifdef DEBUG
1696 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1697 #endif
1698
1699 /* If we have already calculated, skip it. */
1700 if (dfa->eclosures[node_idx].nelem != 0)
1701 continue;
1702 /* Calculate epsilon closure of 'node_idx'. */
1703 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1704 if (BE (err != REG_NOERROR, 0))
1705 return err;
1706
1707 if (dfa->eclosures[node_idx].nelem == 0)
1708 {
1709 incomplete = true;
1710 re_node_set_free (&eclosure_elem);
1711 }
1712 }
1713 return REG_NOERROR;
1714 }
1715
1716 /* Calculate epsilon closure of NODE. */
1717
1718 static reg_errcode_t
1719 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1720 {
1721 reg_errcode_t err;
1722 Idx i;
1723 re_node_set eclosure;
1724 bool ok;
1725 bool incomplete = false;
1726 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1727 if (BE (err != REG_NOERROR, 0))
1728 return err;
1729
1730 /* This indicates that we are calculating this node now.
1731 We reference this value to avoid infinite loop. */
1732 dfa->eclosures[node].nelem = REG_MISSING;
1733
1734 /* If the current node has constraints, duplicate all nodes
1735 since they must inherit the constraints. */
1736 if (dfa->nodes[node].constraint
1737 && dfa->edests[node].nelem
1738 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1739 {
1740 err = duplicate_node_closure (dfa, node, node, node,
1741 dfa->nodes[node].constraint);
1742 if (BE (err != REG_NOERROR, 0))
1743 return err;
1744 }
1745
1746 /* Expand each epsilon destination nodes. */
1747 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1748 for (i = 0; i < dfa->edests[node].nelem; ++i)
1749 {
1750 re_node_set eclosure_elem;
1751 Idx edest = dfa->edests[node].elems[i];
1752 /* If calculating the epsilon closure of 'edest' is in progress,
1753 return intermediate result. */
1754 if (dfa->eclosures[edest].nelem == REG_MISSING)
1755 {
1756 incomplete = true;
1757 continue;
1758 }
1759 /* If we haven't calculated the epsilon closure of 'edest' yet,
1760 calculate now. Otherwise use calculated epsilon closure. */
1761 if (dfa->eclosures[edest].nelem == 0)
1762 {
1763 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1764 if (BE (err != REG_NOERROR, 0))
1765 return err;
1766 }
1767 else
1768 eclosure_elem = dfa->eclosures[edest];
1769 /* Merge the epsilon closure of 'edest'. */
1770 err = re_node_set_merge (&eclosure, &eclosure_elem);
1771 if (BE (err != REG_NOERROR, 0))
1772 return err;
1773 /* If the epsilon closure of 'edest' is incomplete,
1774 the epsilon closure of this node is also incomplete. */
1775 if (dfa->eclosures[edest].nelem == 0)
1776 {
1777 incomplete = true;
1778 re_node_set_free (&eclosure_elem);
1779 }
1780 }
1781
1782 /* An epsilon closure includes itself. */
1783 ok = re_node_set_insert (&eclosure, node);
1784 if (BE (! ok, 0))
1785 return REG_ESPACE;
1786 if (incomplete && !root)
1787 dfa->eclosures[node].nelem = 0;
1788 else
1789 dfa->eclosures[node] = eclosure;
1790 *new_set = eclosure;
1791 return REG_NOERROR;
1792 }
1793 \f
1794 /* Functions for token which are used in the parser. */
1795
1796 /* Fetch a token from INPUT.
1797 We must not use this function inside bracket expressions. */
1798
1799 static void
1800 internal_function
1801 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1802 {
1803 re_string_skip_bytes (input, peek_token (result, input, syntax));
1804 }
1805
1806 /* Peek a token from INPUT, and return the length of the token.
1807 We must not use this function inside bracket expressions. */
1808
1809 static int
1810 internal_function
1811 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1812 {
1813 unsigned char c;
1814
1815 if (re_string_eoi (input))
1816 {
1817 token->type = END_OF_RE;
1818 return 0;
1819 }
1820
1821 c = re_string_peek_byte (input, 0);
1822 token->opr.c = c;
1823
1824 token->word_char = 0;
1825 #ifdef RE_ENABLE_I18N
1826 token->mb_partial = 0;
1827 if (input->mb_cur_max > 1 &&
1828 !re_string_first_byte (input, re_string_cur_idx (input)))
1829 {
1830 token->type = CHARACTER;
1831 token->mb_partial = 1;
1832 return 1;
1833 }
1834 #endif
1835 if (c == '\\')
1836 {
1837 unsigned char c2;
1838 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1839 {
1840 token->type = BACK_SLASH;
1841 return 1;
1842 }
1843
1844 c2 = re_string_peek_byte_case (input, 1);
1845 token->opr.c = c2;
1846 token->type = CHARACTER;
1847 #ifdef RE_ENABLE_I18N
1848 if (input->mb_cur_max > 1)
1849 {
1850 wint_t wc = re_string_wchar_at (input,
1851 re_string_cur_idx (input) + 1);
1852 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1853 }
1854 else
1855 #endif
1856 token->word_char = IS_WORD_CHAR (c2) != 0;
1857
1858 switch (c2)
1859 {
1860 case '|':
1861 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1862 token->type = OP_ALT;
1863 break;
1864 case '1': case '2': case '3': case '4': case '5':
1865 case '6': case '7': case '8': case '9':
1866 if (!(syntax & RE_NO_BK_REFS))
1867 {
1868 token->type = OP_BACK_REF;
1869 token->opr.idx = c2 - '1';
1870 }
1871 break;
1872 case '<':
1873 if (!(syntax & RE_NO_GNU_OPS))
1874 {
1875 token->type = ANCHOR;
1876 token->opr.ctx_type = WORD_FIRST;
1877 }
1878 break;
1879 case '>':
1880 if (!(syntax & RE_NO_GNU_OPS))
1881 {
1882 token->type = ANCHOR;
1883 token->opr.ctx_type = WORD_LAST;
1884 }
1885 break;
1886 case 'b':
1887 if (!(syntax & RE_NO_GNU_OPS))
1888 {
1889 token->type = ANCHOR;
1890 token->opr.ctx_type = WORD_DELIM;
1891 }
1892 break;
1893 case 'B':
1894 if (!(syntax & RE_NO_GNU_OPS))
1895 {
1896 token->type = ANCHOR;
1897 token->opr.ctx_type = NOT_WORD_DELIM;
1898 }
1899 break;
1900 case 'w':
1901 if (!(syntax & RE_NO_GNU_OPS))
1902 token->type = OP_WORD;
1903 break;
1904 case 'W':
1905 if (!(syntax & RE_NO_GNU_OPS))
1906 token->type = OP_NOTWORD;
1907 break;
1908 case 's':
1909 if (!(syntax & RE_NO_GNU_OPS))
1910 token->type = OP_SPACE;
1911 break;
1912 case 'S':
1913 if (!(syntax & RE_NO_GNU_OPS))
1914 token->type = OP_NOTSPACE;
1915 break;
1916 case '`':
1917 if (!(syntax & RE_NO_GNU_OPS))
1918 {
1919 token->type = ANCHOR;
1920 token->opr.ctx_type = BUF_FIRST;
1921 }
1922 break;
1923 case '\'':
1924 if (!(syntax & RE_NO_GNU_OPS))
1925 {
1926 token->type = ANCHOR;
1927 token->opr.ctx_type = BUF_LAST;
1928 }
1929 break;
1930 case '(':
1931 if (!(syntax & RE_NO_BK_PARENS))
1932 token->type = OP_OPEN_SUBEXP;
1933 break;
1934 case ')':
1935 if (!(syntax & RE_NO_BK_PARENS))
1936 token->type = OP_CLOSE_SUBEXP;
1937 break;
1938 case '+':
1939 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1940 token->type = OP_DUP_PLUS;
1941 break;
1942 case '?':
1943 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1944 token->type = OP_DUP_QUESTION;
1945 break;
1946 case '{':
1947 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1948 token->type = OP_OPEN_DUP_NUM;
1949 break;
1950 case '}':
1951 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1952 token->type = OP_CLOSE_DUP_NUM;
1953 break;
1954 default:
1955 break;
1956 }
1957 return 2;
1958 }
1959
1960 token->type = CHARACTER;
1961 #ifdef RE_ENABLE_I18N
1962 if (input->mb_cur_max > 1)
1963 {
1964 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1965 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1966 }
1967 else
1968 #endif
1969 token->word_char = IS_WORD_CHAR (token->opr.c);
1970
1971 switch (c)
1972 {
1973 case '\n':
1974 if (syntax & RE_NEWLINE_ALT)
1975 token->type = OP_ALT;
1976 break;
1977 case '|':
1978 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1979 token->type = OP_ALT;
1980 break;
1981 case '*':
1982 token->type = OP_DUP_ASTERISK;
1983 break;
1984 case '+':
1985 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1986 token->type = OP_DUP_PLUS;
1987 break;
1988 case '?':
1989 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1990 token->type = OP_DUP_QUESTION;
1991 break;
1992 case '{':
1993 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1994 token->type = OP_OPEN_DUP_NUM;
1995 break;
1996 case '}':
1997 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1998 token->type = OP_CLOSE_DUP_NUM;
1999 break;
2000 case '(':
2001 if (syntax & RE_NO_BK_PARENS)
2002 token->type = OP_OPEN_SUBEXP;
2003 break;
2004 case ')':
2005 if (syntax & RE_NO_BK_PARENS)
2006 token->type = OP_CLOSE_SUBEXP;
2007 break;
2008 case '[':
2009 token->type = OP_OPEN_BRACKET;
2010 break;
2011 case '.':
2012 token->type = OP_PERIOD;
2013 break;
2014 case '^':
2015 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2016 re_string_cur_idx (input) != 0)
2017 {
2018 char prev = re_string_peek_byte (input, -1);
2019 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2020 break;
2021 }
2022 token->type = ANCHOR;
2023 token->opr.ctx_type = LINE_FIRST;
2024 break;
2025 case '$':
2026 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2027 re_string_cur_idx (input) + 1 != re_string_length (input))
2028 {
2029 re_token_t next;
2030 re_string_skip_bytes (input, 1);
2031 peek_token (&next, input, syntax);
2032 re_string_skip_bytes (input, -1);
2033 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2034 break;
2035 }
2036 token->type = ANCHOR;
2037 token->opr.ctx_type = LINE_LAST;
2038 break;
2039 default:
2040 break;
2041 }
2042 return 1;
2043 }
2044
2045 /* Peek a token from INPUT, and return the length of the token.
2046 We must not use this function out of bracket expressions. */
2047
2048 static int
2049 internal_function
2050 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2051 {
2052 unsigned char c;
2053 if (re_string_eoi (input))
2054 {
2055 token->type = END_OF_RE;
2056 return 0;
2057 }
2058 c = re_string_peek_byte (input, 0);
2059 token->opr.c = c;
2060
2061 #ifdef RE_ENABLE_I18N
2062 if (input->mb_cur_max > 1 &&
2063 !re_string_first_byte (input, re_string_cur_idx (input)))
2064 {
2065 token->type = CHARACTER;
2066 return 1;
2067 }
2068 #endif /* RE_ENABLE_I18N */
2069
2070 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2071 && re_string_cur_idx (input) + 1 < re_string_length (input))
2072 {
2073 /* In this case, '\' escape a character. */
2074 unsigned char c2;
2075 re_string_skip_bytes (input, 1);
2076 c2 = re_string_peek_byte (input, 0);
2077 token->opr.c = c2;
2078 token->type = CHARACTER;
2079 return 1;
2080 }
2081 if (c == '[') /* '[' is a special char in a bracket exps. */
2082 {
2083 unsigned char c2;
2084 int token_len;
2085 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2086 c2 = re_string_peek_byte (input, 1);
2087 else
2088 c2 = 0;
2089 token->opr.c = c2;
2090 token_len = 2;
2091 switch (c2)
2092 {
2093 case '.':
2094 token->type = OP_OPEN_COLL_ELEM;
2095 break;
2096 case '=':
2097 token->type = OP_OPEN_EQUIV_CLASS;
2098 break;
2099 case ':':
2100 if (syntax & RE_CHAR_CLASSES)
2101 {
2102 token->type = OP_OPEN_CHAR_CLASS;
2103 break;
2104 }
2105 /* else fall through. */
2106 default:
2107 token->type = CHARACTER;
2108 token->opr.c = c;
2109 token_len = 1;
2110 break;
2111 }
2112 return token_len;
2113 }
2114 switch (c)
2115 {
2116 case '-':
2117 token->type = OP_CHARSET_RANGE;
2118 break;
2119 case ']':
2120 token->type = OP_CLOSE_BRACKET;
2121 break;
2122 case '^':
2123 token->type = OP_NON_MATCH_LIST;
2124 break;
2125 default:
2126 token->type = CHARACTER;
2127 }
2128 return 1;
2129 }
2130 \f
2131 /* Functions for parser. */
2132
2133 /* Entry point of the parser.
2134 Parse the regular expression REGEXP and return the structure tree.
2135 If an error occurs, ERR is set by error code, and return NULL.
2136 This function build the following tree, from regular expression <reg_exp>:
2137 CAT
2138 / \
2139 / \
2140 <reg_exp> EOR
2141
2142 CAT means concatenation.
2143 EOR means end of regular expression. */
2144
2145 static bin_tree_t *
2146 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2147 reg_errcode_t *err)
2148 {
2149 re_dfa_t *dfa = preg->buffer;
2150 bin_tree_t *tree, *eor, *root;
2151 re_token_t current_token;
2152 dfa->syntax = syntax;
2153 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2154 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2155 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2156 return NULL;
2157 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2158 if (tree != NULL)
2159 root = create_tree (dfa, tree, eor, CONCAT);
2160 else
2161 root = eor;
2162 if (BE (eor == NULL || root == NULL, 0))
2163 {
2164 *err = REG_ESPACE;
2165 return NULL;
2166 }
2167 return root;
2168 }
2169
2170 /* This function build the following tree, from regular expression
2171 <branch1>|<branch2>:
2172 ALT
2173 / \
2174 / \
2175 <branch1> <branch2>
2176
2177 ALT means alternative, which represents the operator '|'. */
2178
2179 static bin_tree_t *
2180 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2181 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2182 {
2183 re_dfa_t *dfa = preg->buffer;
2184 bin_tree_t *tree, *branch = NULL;
2185 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2186 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2187 return NULL;
2188
2189 while (token->type == OP_ALT)
2190 {
2191 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2192 if (token->type != OP_ALT && token->type != END_OF_RE
2193 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2194 {
2195 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2196 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2197 return NULL;
2198 }
2199 else
2200 branch = NULL;
2201 tree = create_tree (dfa, tree, branch, OP_ALT);
2202 if (BE (tree == NULL, 0))
2203 {
2204 *err = REG_ESPACE;
2205 return NULL;
2206 }
2207 }
2208 return tree;
2209 }
2210
2211 /* This function build the following tree, from regular expression
2212 <exp1><exp2>:
2213 CAT
2214 / \
2215 / \
2216 <exp1> <exp2>
2217
2218 CAT means concatenation. */
2219
2220 static bin_tree_t *
2221 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2222 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2223 {
2224 bin_tree_t *tree, *expr;
2225 re_dfa_t *dfa = preg->buffer;
2226 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2227 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2228 return NULL;
2229
2230 while (token->type != OP_ALT && token->type != END_OF_RE
2231 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2232 {
2233 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2234 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2235 {
2236 if (tree != NULL)
2237 postorder (tree, free_tree, NULL);
2238 return NULL;
2239 }
2240 if (tree != NULL && expr != NULL)
2241 {
2242 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2243 if (newtree == NULL)
2244 {
2245 postorder (expr, free_tree, NULL);
2246 postorder (tree, free_tree, NULL);
2247 *err = REG_ESPACE;
2248 return NULL;
2249 }
2250 tree = newtree;
2251 }
2252 else if (tree == NULL)
2253 tree = expr;
2254 /* Otherwise expr == NULL, we don't need to create new tree. */
2255 }
2256 return tree;
2257 }
2258
2259 /* This function build the following tree, from regular expression a*:
2260 *
2261 |
2262 a
2263 */
2264
2265 static bin_tree_t *
2266 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2267 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2268 {
2269 re_dfa_t *dfa = preg->buffer;
2270 bin_tree_t *tree;
2271 switch (token->type)
2272 {
2273 case CHARACTER:
2274 tree = create_token_tree (dfa, NULL, NULL, token);
2275 if (BE (tree == NULL, 0))
2276 {
2277 *err = REG_ESPACE;
2278 return NULL;
2279 }
2280 #ifdef RE_ENABLE_I18N
2281 if (dfa->mb_cur_max > 1)
2282 {
2283 while (!re_string_eoi (regexp)
2284 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2285 {
2286 bin_tree_t *mbc_remain;
2287 fetch_token (token, regexp, syntax);
2288 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2289 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2290 if (BE (mbc_remain == NULL || tree == NULL, 0))
2291 {
2292 *err = REG_ESPACE;
2293 return NULL;
2294 }
2295 }
2296 }
2297 #endif
2298 break;
2299 case OP_OPEN_SUBEXP:
2300 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2301 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2302 return NULL;
2303 break;
2304 case OP_OPEN_BRACKET:
2305 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2306 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2307 return NULL;
2308 break;
2309 case OP_BACK_REF:
2310 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2311 {
2312 *err = REG_ESUBREG;
2313 return NULL;
2314 }
2315 dfa->used_bkref_map |= 1 << token->opr.idx;
2316 tree = create_token_tree (dfa, NULL, NULL, token);
2317 if (BE (tree == NULL, 0))
2318 {
2319 *err = REG_ESPACE;
2320 return NULL;
2321 }
2322 ++dfa->nbackref;
2323 dfa->has_mb_node = 1;
2324 break;
2325 case OP_OPEN_DUP_NUM:
2326 if (syntax & RE_CONTEXT_INVALID_DUP)
2327 {
2328 *err = REG_BADRPT;
2329 return NULL;
2330 }
2331 /* FALLTHROUGH */
2332 case OP_DUP_ASTERISK:
2333 case OP_DUP_PLUS:
2334 case OP_DUP_QUESTION:
2335 if (syntax & RE_CONTEXT_INVALID_OPS)
2336 {
2337 *err = REG_BADRPT;
2338 return NULL;
2339 }
2340 else if (syntax & RE_CONTEXT_INDEP_OPS)
2341 {
2342 fetch_token (token, regexp, syntax);
2343 return parse_expression (regexp, preg, token, syntax, nest, err);
2344 }
2345 /* else fall through */
2346 case OP_CLOSE_SUBEXP:
2347 if ((token->type == OP_CLOSE_SUBEXP) &&
2348 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2349 {
2350 *err = REG_ERPAREN;
2351 return NULL;
2352 }
2353 /* else fall through */
2354 case OP_CLOSE_DUP_NUM:
2355 /* We treat it as a normal character. */
2356
2357 /* Then we can these characters as normal characters. */
2358 token->type = CHARACTER;
2359 /* mb_partial and word_char bits should be initialized already
2360 by peek_token. */
2361 tree = create_token_tree (dfa, NULL, NULL, token);
2362 if (BE (tree == NULL, 0))
2363 {
2364 *err = REG_ESPACE;
2365 return NULL;
2366 }
2367 break;
2368 case ANCHOR:
2369 if ((token->opr.ctx_type
2370 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2371 && dfa->word_ops_used == 0)
2372 init_word_char (dfa);
2373 if (token->opr.ctx_type == WORD_DELIM
2374 || token->opr.ctx_type == NOT_WORD_DELIM)
2375 {
2376 bin_tree_t *tree_first, *tree_last;
2377 if (token->opr.ctx_type == WORD_DELIM)
2378 {
2379 token->opr.ctx_type = WORD_FIRST;
2380 tree_first = create_token_tree (dfa, NULL, NULL, token);
2381 token->opr.ctx_type = WORD_LAST;
2382 }
2383 else
2384 {
2385 token->opr.ctx_type = INSIDE_WORD;
2386 tree_first = create_token_tree (dfa, NULL, NULL, token);
2387 token->opr.ctx_type = INSIDE_NOTWORD;
2388 }
2389 tree_last = create_token_tree (dfa, NULL, NULL, token);
2390 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2391 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2392 {
2393 *err = REG_ESPACE;
2394 return NULL;
2395 }
2396 }
2397 else
2398 {
2399 tree = create_token_tree (dfa, NULL, NULL, token);
2400 if (BE (tree == NULL, 0))
2401 {
2402 *err = REG_ESPACE;
2403 return NULL;
2404 }
2405 }
2406 /* We must return here, since ANCHORs can't be followed
2407 by repetition operators.
2408 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2409 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2410 fetch_token (token, regexp, syntax);
2411 return tree;
2412 case OP_PERIOD:
2413 tree = create_token_tree (dfa, NULL, NULL, token);
2414 if (BE (tree == NULL, 0))
2415 {
2416 *err = REG_ESPACE;
2417 return NULL;
2418 }
2419 if (dfa->mb_cur_max > 1)
2420 dfa->has_mb_node = 1;
2421 break;
2422 case OP_WORD:
2423 case OP_NOTWORD:
2424 tree = build_charclass_op (dfa, regexp->trans,
2425 "alnum",
2426 "_",
2427 token->type == OP_NOTWORD, err);
2428 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2429 return NULL;
2430 break;
2431 case OP_SPACE:
2432 case OP_NOTSPACE:
2433 tree = build_charclass_op (dfa, regexp->trans,
2434 "space",
2435 "",
2436 token->type == OP_NOTSPACE, err);
2437 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2438 return NULL;
2439 break;
2440 case OP_ALT:
2441 case END_OF_RE:
2442 return NULL;
2443 case BACK_SLASH:
2444 *err = REG_EESCAPE;
2445 return NULL;
2446 default:
2447 /* Must not happen? */
2448 #ifdef DEBUG
2449 assert (0);
2450 #endif
2451 return NULL;
2452 }
2453 fetch_token (token, regexp, syntax);
2454
2455 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2456 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2457 {
2458 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2459 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2460 return NULL;
2461 /* In BRE consecutive duplications are not allowed. */
2462 if ((syntax & RE_CONTEXT_INVALID_DUP)
2463 && (token->type == OP_DUP_ASTERISK
2464 || token->type == OP_OPEN_DUP_NUM))
2465 {
2466 *err = REG_BADRPT;
2467 return NULL;
2468 }
2469 }
2470
2471 return tree;
2472 }
2473
2474 /* This function build the following tree, from regular expression
2475 (<reg_exp>):
2476 SUBEXP
2477 |
2478 <reg_exp>
2479 */
2480
2481 static bin_tree_t *
2482 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2483 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2484 {
2485 re_dfa_t *dfa = preg->buffer;
2486 bin_tree_t *tree;
2487 size_t cur_nsub;
2488 cur_nsub = preg->re_nsub++;
2489
2490 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2491
2492 /* The subexpression may be a null string. */
2493 if (token->type == OP_CLOSE_SUBEXP)
2494 tree = NULL;
2495 else
2496 {
2497 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2498 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2499 {
2500 if (tree != NULL)
2501 postorder (tree, free_tree, NULL);
2502 *err = REG_EPAREN;
2503 }
2504 if (BE (*err != REG_NOERROR, 0))
2505 return NULL;
2506 }
2507
2508 if (cur_nsub <= '9' - '1')
2509 dfa->completed_bkref_map |= 1 << cur_nsub;
2510
2511 tree = create_tree (dfa, tree, NULL, SUBEXP);
2512 if (BE (tree == NULL, 0))
2513 {
2514 *err = REG_ESPACE;
2515 return NULL;
2516 }
2517 tree->token.opr.idx = cur_nsub;
2518 return tree;
2519 }
2520
2521 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2522
2523 static bin_tree_t *
2524 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2525 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2526 {
2527 bin_tree_t *tree = NULL, *old_tree = NULL;
2528 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2529 re_token_t start_token = *token;
2530
2531 if (token->type == OP_OPEN_DUP_NUM)
2532 {
2533 end = 0;
2534 start = fetch_number (regexp, token, syntax);
2535 if (start == REG_MISSING)
2536 {
2537 if (token->type == CHARACTER && token->opr.c == ',')
2538 start = 0; /* We treat "{,m}" as "{0,m}". */
2539 else
2540 {
2541 *err = REG_BADBR; /* <re>{} is invalid. */
2542 return NULL;
2543 }
2544 }
2545 if (BE (start != REG_ERROR, 1))
2546 {
2547 /* We treat "{n}" as "{n,n}". */
2548 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2549 : ((token->type == CHARACTER && token->opr.c == ',')
2550 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2551 }
2552 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2553 {
2554 /* Invalid sequence. */
2555 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2556 {
2557 if (token->type == END_OF_RE)
2558 *err = REG_EBRACE;
2559 else
2560 *err = REG_BADBR;
2561
2562 return NULL;
2563 }
2564
2565 /* If the syntax bit is set, rollback. */
2566 re_string_set_index (regexp, start_idx);
2567 *token = start_token;
2568 token->type = CHARACTER;
2569 /* mb_partial and word_char bits should be already initialized by
2570 peek_token. */
2571 return elem;
2572 }
2573
2574 if (BE ((end != REG_MISSING && start > end)
2575 || token->type != OP_CLOSE_DUP_NUM, 0))
2576 {
2577 /* First number greater than second. */
2578 *err = REG_BADBR;
2579 return NULL;
2580 }
2581
2582 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2583 {
2584 *err = REG_ESIZE;
2585 return NULL;
2586 }
2587 }
2588 else
2589 {
2590 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2591 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2592 }
2593
2594 fetch_token (token, regexp, syntax);
2595
2596 if (BE (elem == NULL, 0))
2597 return NULL;
2598 if (BE (start == 0 && end == 0, 0))
2599 {
2600 postorder (elem, free_tree, NULL);
2601 return NULL;
2602 }
2603
2604 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2605 if (BE (start > 0, 0))
2606 {
2607 tree = elem;
2608 for (i = 2; i <= start; ++i)
2609 {
2610 elem = duplicate_tree (elem, dfa);
2611 tree = create_tree (dfa, tree, elem, CONCAT);
2612 if (BE (elem == NULL || tree == NULL, 0))
2613 goto parse_dup_op_espace;
2614 }
2615
2616 if (start == end)
2617 return tree;
2618
2619 /* Duplicate ELEM before it is marked optional. */
2620 elem = duplicate_tree (elem, dfa);
2621 old_tree = tree;
2622 }
2623 else
2624 old_tree = NULL;
2625
2626 if (elem->token.type == SUBEXP)
2627 {
2628 uintptr_t subidx = elem->token.opr.idx;
2629 postorder (elem, mark_opt_subexp, (void *) subidx);
2630 }
2631
2632 tree = create_tree (dfa, elem, NULL,
2633 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2634 if (BE (tree == NULL, 0))
2635 goto parse_dup_op_espace;
2636
2637 /* From gnulib's "intprops.h":
2638 True if the arithmetic type T is signed. */
2639 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2640
2641 /* This loop is actually executed only when end != REG_MISSING,
2642 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2643 already created the start+1-th copy. */
2644 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2645 for (i = start + 2; i <= end; ++i)
2646 {
2647 elem = duplicate_tree (elem, dfa);
2648 tree = create_tree (dfa, tree, elem, CONCAT);
2649 if (BE (elem == NULL || tree == NULL, 0))
2650 goto parse_dup_op_espace;
2651
2652 tree = create_tree (dfa, tree, NULL, OP_ALT);
2653 if (BE (tree == NULL, 0))
2654 goto parse_dup_op_espace;
2655 }
2656
2657 if (old_tree)
2658 tree = create_tree (dfa, old_tree, tree, CONCAT);
2659
2660 return tree;
2661
2662 parse_dup_op_espace:
2663 *err = REG_ESPACE;
2664 return NULL;
2665 }
2666
2667 /* Size of the names for collating symbol/equivalence_class/character_class.
2668 I'm not sure, but maybe enough. */
2669 #define BRACKET_NAME_BUF_SIZE 32
2670
2671 #ifndef _LIBC
2672 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2673 Build the range expression which starts from START_ELEM, and ends
2674 at END_ELEM. The result are written to MBCSET and SBCSET.
2675 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2676 mbcset->range_ends, is a pointer argument since we may
2677 update it. */
2678
2679 static reg_errcode_t
2680 internal_function
2681 # ifdef RE_ENABLE_I18N
2682 build_range_exp (const reg_syntax_t syntax,
2683 bitset_t sbcset,
2684 re_charset_t *mbcset,
2685 Idx *range_alloc,
2686 const bracket_elem_t *start_elem,
2687 const bracket_elem_t *end_elem)
2688 # else /* not RE_ENABLE_I18N */
2689 build_range_exp (const reg_syntax_t syntax,
2690 bitset_t sbcset,
2691 const bracket_elem_t *start_elem,
2692 const bracket_elem_t *end_elem)
2693 # endif /* not RE_ENABLE_I18N */
2694 {
2695 unsigned int start_ch, end_ch;
2696 /* Equivalence Classes and Character Classes can't be a range start/end. */
2697 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2698 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2699 0))
2700 return REG_ERANGE;
2701
2702 /* We can handle no multi character collating elements without libc
2703 support. */
2704 if (BE ((start_elem->type == COLL_SYM
2705 && strlen ((char *) start_elem->opr.name) > 1)
2706 || (end_elem->type == COLL_SYM
2707 && strlen ((char *) end_elem->opr.name) > 1), 0))
2708 return REG_ECOLLATE;
2709
2710 # ifdef RE_ENABLE_I18N
2711 {
2712 wchar_t wc;
2713 wint_t start_wc;
2714 wint_t end_wc;
2715
2716 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2717 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2718 : 0));
2719 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2720 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2721 : 0));
2722 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2723 ? __btowc (start_ch) : start_elem->opr.wch);
2724 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2725 ? __btowc (end_ch) : end_elem->opr.wch);
2726 if (start_wc == WEOF || end_wc == WEOF)
2727 return REG_ECOLLATE;
2728 else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2729 return REG_ERANGE;
2730
2731 /* Got valid collation sequence values, add them as a new entry.
2732 However, for !_LIBC we have no collation elements: if the
2733 character set is single byte, the single byte character set
2734 that we build below suffices. parse_bracket_exp passes
2735 no MBCSET if dfa->mb_cur_max == 1. */
2736 if (mbcset)
2737 {
2738 /* Check the space of the arrays. */
2739 if (BE (*range_alloc == mbcset->nranges, 0))
2740 {
2741 /* There is not enough space, need realloc. */
2742 wchar_t *new_array_start, *new_array_end;
2743 Idx new_nranges;
2744
2745 /* +1 in case of mbcset->nranges is 0. */
2746 new_nranges = 2 * mbcset->nranges + 1;
2747 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2748 are NULL if *range_alloc == 0. */
2749 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2750 new_nranges);
2751 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2752 new_nranges);
2753
2754 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2755 return REG_ESPACE;
2756
2757 mbcset->range_starts = new_array_start;
2758 mbcset->range_ends = new_array_end;
2759 *range_alloc = new_nranges;
2760 }
2761
2762 mbcset->range_starts[mbcset->nranges] = start_wc;
2763 mbcset->range_ends[mbcset->nranges++] = end_wc;
2764 }
2765
2766 /* Build the table for single byte characters. */
2767 for (wc = 0; wc < SBC_MAX; ++wc)
2768 {
2769 if (start_wc <= wc && wc <= end_wc)
2770 bitset_set (sbcset, wc);
2771 }
2772 }
2773 # else /* not RE_ENABLE_I18N */
2774 {
2775 unsigned int ch;
2776 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2777 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2778 : 0));
2779 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2780 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2781 : 0));
2782 if (start_ch > end_ch)
2783 return REG_ERANGE;
2784 /* Build the table for single byte characters. */
2785 for (ch = 0; ch < SBC_MAX; ++ch)
2786 if (start_ch <= ch && ch <= end_ch)
2787 bitset_set (sbcset, ch);
2788 }
2789 # endif /* not RE_ENABLE_I18N */
2790 return REG_NOERROR;
2791 }
2792 #endif /* not _LIBC */
2793
2794 #ifndef _LIBC
2795 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2796 Build the collating element which is represented by NAME.
2797 The result are written to MBCSET and SBCSET.
2798 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2799 pointer argument since we may update it. */
2800
2801 static reg_errcode_t
2802 internal_function
2803 # ifdef RE_ENABLE_I18N
2804 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2805 Idx *coll_sym_alloc, const unsigned char *name)
2806 # else /* not RE_ENABLE_I18N */
2807 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2808 # endif /* not RE_ENABLE_I18N */
2809 {
2810 size_t name_len = strlen ((const char *) name);
2811 if (BE (name_len != 1, 0))
2812 return REG_ECOLLATE;
2813 else
2814 {
2815 bitset_set (sbcset, name[0]);
2816 return REG_NOERROR;
2817 }
2818 }
2819 #endif /* not _LIBC */
2820
2821 /* This function parse bracket expression like "[abc]", "[a-c]",
2822 "[[.a-a.]]" etc. */
2823
2824 static bin_tree_t *
2825 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2826 reg_syntax_t syntax, reg_errcode_t *err)
2827 {
2828 #ifdef _LIBC
2829 const unsigned char *collseqmb;
2830 const char *collseqwc;
2831 uint32_t nrules;
2832 int32_t table_size;
2833 const int32_t *symb_table;
2834 const unsigned char *extra;
2835
2836 /* Local function for parse_bracket_exp used in _LIBC environment.
2837 Seek the collating symbol entry corresponding to NAME.
2838 Return the index of the symbol in the SYMB_TABLE. */
2839
2840 auto inline int32_t
2841 __attribute ((always_inline))
2842 seek_collating_symbol_entry (name, name_len)
2843 const unsigned char *name;
2844 size_t name_len;
2845 {
2846 int32_t hash = elem_hash ((const char *) name, name_len);
2847 int32_t elem = hash % table_size;
2848 if (symb_table[2 * elem] != 0)
2849 {
2850 int32_t second = hash % (table_size - 2) + 1;
2851
2852 do
2853 {
2854 /* First compare the hashing value. */
2855 if (symb_table[2 * elem] == hash
2856 /* Compare the length of the name. */
2857 && name_len == extra[symb_table[2 * elem + 1]]
2858 /* Compare the name. */
2859 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2860 name_len) == 0)
2861 {
2862 /* Yep, this is the entry. */
2863 break;
2864 }
2865
2866 /* Next entry. */
2867 elem += second;
2868 }
2869 while (symb_table[2 * elem] != 0);
2870 }
2871 return elem;
2872 }
2873
2874 /* Local function for parse_bracket_exp used in _LIBC environment.
2875 Look up the collation sequence value of BR_ELEM.
2876 Return the value if succeeded, UINT_MAX otherwise. */
2877
2878 auto inline unsigned int
2879 __attribute ((always_inline))
2880 lookup_collation_sequence_value (br_elem)
2881 bracket_elem_t *br_elem;
2882 {
2883 if (br_elem->type == SB_CHAR)
2884 {
2885 /*
2886 if (MB_CUR_MAX == 1)
2887 */
2888 if (nrules == 0)
2889 return collseqmb[br_elem->opr.ch];
2890 else
2891 {
2892 wint_t wc = __btowc (br_elem->opr.ch);
2893 return __collseq_table_lookup (collseqwc, wc);
2894 }
2895 }
2896 else if (br_elem->type == MB_CHAR)
2897 {
2898 if (nrules != 0)
2899 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2900 }
2901 else if (br_elem->type == COLL_SYM)
2902 {
2903 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2904 if (nrules != 0)
2905 {
2906 int32_t elem, idx;
2907 elem = seek_collating_symbol_entry (br_elem->opr.name,
2908 sym_name_len);
2909 if (symb_table[2 * elem] != 0)
2910 {
2911 /* We found the entry. */
2912 idx = symb_table[2 * elem + 1];
2913 /* Skip the name of collating element name. */
2914 idx += 1 + extra[idx];
2915 /* Skip the byte sequence of the collating element. */
2916 idx += 1 + extra[idx];
2917 /* Adjust for the alignment. */
2918 idx = (idx + 3) & ~3;
2919 /* Skip the multibyte collation sequence value. */
2920 idx += sizeof (unsigned int);
2921 /* Skip the wide char sequence of the collating element. */
2922 idx += sizeof (unsigned int) *
2923 (1 + *(unsigned int *) (extra + idx));
2924 /* Return the collation sequence value. */
2925 return *(unsigned int *) (extra + idx);
2926 }
2927 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2928 {
2929 /* No valid character. Match it as a single byte
2930 character. */
2931 return collseqmb[br_elem->opr.name[0]];
2932 }
2933 }
2934 else if (sym_name_len == 1)
2935 return collseqmb[br_elem->opr.name[0]];
2936 }
2937 return UINT_MAX;
2938 }
2939
2940 /* Local function for parse_bracket_exp used in _LIBC environment.
2941 Build the range expression which starts from START_ELEM, and ends
2942 at END_ELEM. The result are written to MBCSET and SBCSET.
2943 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2944 mbcset->range_ends, is a pointer argument since we may
2945 update it. */
2946
2947 auto inline reg_errcode_t
2948 __attribute ((always_inline))
2949 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2950 re_charset_t *mbcset;
2951 Idx *range_alloc;
2952 bitset_t sbcset;
2953 bracket_elem_t *start_elem, *end_elem;
2954 {
2955 unsigned int ch;
2956 uint32_t start_collseq;
2957 uint32_t end_collseq;
2958
2959 /* Equivalence Classes and Character Classes can't be a range
2960 start/end. */
2961 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2962 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2963 0))
2964 return REG_ERANGE;
2965
2966 /* FIXME: Implement rational ranges here, too. */
2967 start_collseq = lookup_collation_sequence_value (start_elem);
2968 end_collseq = lookup_collation_sequence_value (end_elem);
2969 /* Check start/end collation sequence values. */
2970 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2971 return REG_ECOLLATE;
2972 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2973 return REG_ERANGE;
2974
2975 /* Got valid collation sequence values, add them as a new entry.
2976 However, if we have no collation elements, and the character set
2977 is single byte, the single byte character set that we
2978 build below suffices. */
2979 if (nrules > 0 || dfa->mb_cur_max > 1)
2980 {
2981 /* Check the space of the arrays. */
2982 if (BE (*range_alloc == mbcset->nranges, 0))
2983 {
2984 /* There is not enough space, need realloc. */
2985 uint32_t *new_array_start;
2986 uint32_t *new_array_end;
2987 Idx new_nranges;
2988
2989 /* +1 in case of mbcset->nranges is 0. */
2990 new_nranges = 2 * mbcset->nranges + 1;
2991 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2992 new_nranges);
2993 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2994 new_nranges);
2995
2996 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2997 return REG_ESPACE;
2998
2999 mbcset->range_starts = new_array_start;
3000 mbcset->range_ends = new_array_end;
3001 *range_alloc = new_nranges;
3002 }
3003
3004 mbcset->range_starts[mbcset->nranges] = start_collseq;
3005 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3006 }
3007
3008 /* Build the table for single byte characters. */
3009 for (ch = 0; ch < SBC_MAX; ch++)
3010 {
3011 uint32_t ch_collseq;
3012 /*
3013 if (MB_CUR_MAX == 1)
3014 */
3015 if (nrules == 0)
3016 ch_collseq = collseqmb[ch];
3017 else
3018 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3019 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3020 bitset_set (sbcset, ch);
3021 }
3022 return REG_NOERROR;
3023 }
3024
3025 /* Local function for parse_bracket_exp used in _LIBC environment.
3026 Build the collating element which is represented by NAME.
3027 The result are written to MBCSET and SBCSET.
3028 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3029 pointer argument since we may update it. */
3030
3031 auto inline reg_errcode_t
3032 __attribute ((always_inline))
3033 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3034 re_charset_t *mbcset;
3035 Idx *coll_sym_alloc;
3036 bitset_t sbcset;
3037 const unsigned char *name;
3038 {
3039 int32_t elem, idx;
3040 size_t name_len = strlen ((const char *) name);
3041 if (nrules != 0)
3042 {
3043 elem = seek_collating_symbol_entry (name, name_len);
3044 if (symb_table[2 * elem] != 0)
3045 {
3046 /* We found the entry. */
3047 idx = symb_table[2 * elem + 1];
3048 /* Skip the name of collating element name. */
3049 idx += 1 + extra[idx];
3050 }
3051 else if (symb_table[2 * elem] == 0 && name_len == 1)
3052 {
3053 /* No valid character, treat it as a normal
3054 character. */
3055 bitset_set (sbcset, name[0]);
3056 return REG_NOERROR;
3057 }
3058 else
3059 return REG_ECOLLATE;
3060
3061 /* Got valid collation sequence, add it as a new entry. */
3062 /* Check the space of the arrays. */
3063 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3064 {
3065 /* Not enough, realloc it. */
3066 /* +1 in case of mbcset->ncoll_syms is 0. */
3067 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3068 /* Use realloc since mbcset->coll_syms is NULL
3069 if *alloc == 0. */
3070 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3071 new_coll_sym_alloc);
3072 if (BE (new_coll_syms == NULL, 0))
3073 return REG_ESPACE;
3074 mbcset->coll_syms = new_coll_syms;
3075 *coll_sym_alloc = new_coll_sym_alloc;
3076 }
3077 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3078 return REG_NOERROR;
3079 }
3080 else
3081 {
3082 if (BE (name_len != 1, 0))
3083 return REG_ECOLLATE;
3084 else
3085 {
3086 bitset_set (sbcset, name[0]);
3087 return REG_NOERROR;
3088 }
3089 }
3090 }
3091 #endif
3092
3093 re_token_t br_token;
3094 re_bitset_ptr_t sbcset;
3095 #ifdef RE_ENABLE_I18N
3096 re_charset_t *mbcset;
3097 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3098 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3099 #endif /* not RE_ENABLE_I18N */
3100 bool non_match = false;
3101 bin_tree_t *work_tree;
3102 int token_len;
3103 bool first_round = true;
3104 #ifdef _LIBC
3105 collseqmb = (const unsigned char *)
3106 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3107 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3108 if (nrules)
3109 {
3110 /*
3111 if (MB_CUR_MAX > 1)
3112 */
3113 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3114 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3115 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3116 _NL_COLLATE_SYMB_TABLEMB);
3117 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3118 _NL_COLLATE_SYMB_EXTRAMB);
3119 }
3120 #endif
3121 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3122 #ifdef RE_ENABLE_I18N
3123 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3124 #endif /* RE_ENABLE_I18N */
3125 #ifdef RE_ENABLE_I18N
3126 if (BE (sbcset == NULL || mbcset == NULL, 0))
3127 #else
3128 if (BE (sbcset == NULL, 0))
3129 #endif /* RE_ENABLE_I18N */
3130 {
3131 re_free (sbcset);
3132 #ifdef RE_ENABLE_I18N
3133 re_free (mbcset);
3134 #endif
3135 *err = REG_ESPACE;
3136 return NULL;
3137 }
3138
3139 token_len = peek_token_bracket (token, regexp, syntax);
3140 if (BE (token->type == END_OF_RE, 0))
3141 {
3142 *err = REG_BADPAT;
3143 goto parse_bracket_exp_free_return;
3144 }
3145 if (token->type == OP_NON_MATCH_LIST)
3146 {
3147 #ifdef RE_ENABLE_I18N
3148 mbcset->non_match = 1;
3149 #endif /* not RE_ENABLE_I18N */
3150 non_match = true;
3151 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3152 bitset_set (sbcset, '\n');
3153 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3154 token_len = peek_token_bracket (token, regexp, syntax);
3155 if (BE (token->type == END_OF_RE, 0))
3156 {
3157 *err = REG_BADPAT;
3158 goto parse_bracket_exp_free_return;
3159 }
3160 }
3161
3162 /* We treat the first ']' as a normal character. */
3163 if (token->type == OP_CLOSE_BRACKET)
3164 token->type = CHARACTER;
3165
3166 while (1)
3167 {
3168 bracket_elem_t start_elem, end_elem;
3169 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3170 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3171 reg_errcode_t ret;
3172 int token_len2 = 0;
3173 bool is_range_exp = false;
3174 re_token_t token2;
3175
3176 start_elem.opr.name = start_name_buf;
3177 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3178 syntax, first_round);
3179 if (BE (ret != REG_NOERROR, 0))
3180 {
3181 *err = ret;
3182 goto parse_bracket_exp_free_return;
3183 }
3184 first_round = false;
3185
3186 /* Get information about the next token. We need it in any case. */
3187 token_len = peek_token_bracket (token, regexp, syntax);
3188
3189 /* Do not check for ranges if we know they are not allowed. */
3190 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3191 {
3192 if (BE (token->type == END_OF_RE, 0))
3193 {
3194 *err = REG_EBRACK;
3195 goto parse_bracket_exp_free_return;
3196 }
3197 if (token->type == OP_CHARSET_RANGE)
3198 {
3199 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3200 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3201 if (BE (token2.type == END_OF_RE, 0))
3202 {
3203 *err = REG_EBRACK;
3204 goto parse_bracket_exp_free_return;
3205 }
3206 if (token2.type == OP_CLOSE_BRACKET)
3207 {
3208 /* We treat the last '-' as a normal character. */
3209 re_string_skip_bytes (regexp, -token_len);
3210 token->type = CHARACTER;
3211 }
3212 else
3213 is_range_exp = true;
3214 }
3215 }
3216
3217 if (is_range_exp == true)
3218 {
3219 end_elem.opr.name = end_name_buf;
3220 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3221 dfa, syntax, true);
3222 if (BE (ret != REG_NOERROR, 0))
3223 {
3224 *err = ret;
3225 goto parse_bracket_exp_free_return;
3226 }
3227
3228 token_len = peek_token_bracket (token, regexp, syntax);
3229
3230 #ifdef _LIBC
3231 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3232 &start_elem, &end_elem);
3233 #else
3234 # ifdef RE_ENABLE_I18N
3235 *err = build_range_exp (syntax, sbcset,
3236 dfa->mb_cur_max > 1 ? mbcset : NULL,
3237 &range_alloc, &start_elem, &end_elem);
3238 # else
3239 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3240 # endif
3241 #endif /* RE_ENABLE_I18N */
3242 if (BE (*err != REG_NOERROR, 0))
3243 goto parse_bracket_exp_free_return;
3244 }
3245 else
3246 {
3247 switch (start_elem.type)
3248 {
3249 case SB_CHAR:
3250 bitset_set (sbcset, start_elem.opr.ch);
3251 break;
3252 #ifdef RE_ENABLE_I18N
3253 case MB_CHAR:
3254 /* Check whether the array has enough space. */
3255 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3256 {
3257 wchar_t *new_mbchars;
3258 /* Not enough, realloc it. */
3259 /* +1 in case of mbcset->nmbchars is 0. */
3260 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3261 /* Use realloc since array is NULL if *alloc == 0. */
3262 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3263 mbchar_alloc);
3264 if (BE (new_mbchars == NULL, 0))
3265 goto parse_bracket_exp_espace;
3266 mbcset->mbchars = new_mbchars;
3267 }
3268 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3269 break;
3270 #endif /* RE_ENABLE_I18N */
3271 case EQUIV_CLASS:
3272 *err = build_equiv_class (sbcset,
3273 #ifdef RE_ENABLE_I18N
3274 mbcset, &equiv_class_alloc,
3275 #endif /* RE_ENABLE_I18N */
3276 start_elem.opr.name);
3277 if (BE (*err != REG_NOERROR, 0))
3278 goto parse_bracket_exp_free_return;
3279 break;
3280 case COLL_SYM:
3281 *err = build_collating_symbol (sbcset,
3282 #ifdef RE_ENABLE_I18N
3283 mbcset, &coll_sym_alloc,
3284 #endif /* RE_ENABLE_I18N */
3285 start_elem.opr.name);
3286 if (BE (*err != REG_NOERROR, 0))
3287 goto parse_bracket_exp_free_return;
3288 break;
3289 case CHAR_CLASS:
3290 *err = build_charclass (regexp->trans, sbcset,
3291 #ifdef RE_ENABLE_I18N
3292 mbcset, &char_class_alloc,
3293 #endif /* RE_ENABLE_I18N */
3294 (const char *) start_elem.opr.name,
3295 syntax);
3296 if (BE (*err != REG_NOERROR, 0))
3297 goto parse_bracket_exp_free_return;
3298 break;
3299 default:
3300 assert (0);
3301 break;
3302 }
3303 }
3304 if (BE (token->type == END_OF_RE, 0))
3305 {
3306 *err = REG_EBRACK;
3307 goto parse_bracket_exp_free_return;
3308 }
3309 if (token->type == OP_CLOSE_BRACKET)
3310 break;
3311 }
3312
3313 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3314
3315 /* If it is non-matching list. */
3316 if (non_match)
3317 bitset_not (sbcset);
3318
3319 #ifdef RE_ENABLE_I18N
3320 /* Ensure only single byte characters are set. */
3321 if (dfa->mb_cur_max > 1)
3322 bitset_mask (sbcset, dfa->sb_char);
3323
3324 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3325 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3326 || mbcset->non_match)))
3327 {
3328 bin_tree_t *mbc_tree;
3329 int sbc_idx;
3330 /* Build a tree for complex bracket. */
3331 dfa->has_mb_node = 1;
3332 br_token.type = COMPLEX_BRACKET;
3333 br_token.opr.mbcset = mbcset;
3334 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3335 if (BE (mbc_tree == NULL, 0))
3336 goto parse_bracket_exp_espace;
3337 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3338 if (sbcset[sbc_idx])
3339 break;
3340 /* If there are no bits set in sbcset, there is no point
3341 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3342 if (sbc_idx < BITSET_WORDS)
3343 {
3344 /* Build a tree for simple bracket. */
3345 br_token.type = SIMPLE_BRACKET;
3346 br_token.opr.sbcset = sbcset;
3347 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3348 if (BE (work_tree == NULL, 0))
3349 goto parse_bracket_exp_espace;
3350
3351 /* Then join them by ALT node. */
3352 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3353 if (BE (work_tree == NULL, 0))
3354 goto parse_bracket_exp_espace;
3355 }
3356 else
3357 {
3358 re_free (sbcset);
3359 work_tree = mbc_tree;
3360 }
3361 }
3362 else
3363 #endif /* not RE_ENABLE_I18N */
3364 {
3365 #ifdef RE_ENABLE_I18N
3366 free_charset (mbcset);
3367 #endif
3368 /* Build a tree for simple bracket. */
3369 br_token.type = SIMPLE_BRACKET;
3370 br_token.opr.sbcset = sbcset;
3371 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3372 if (BE (work_tree == NULL, 0))
3373 goto parse_bracket_exp_espace;
3374 }
3375 return work_tree;
3376
3377 parse_bracket_exp_espace:
3378 *err = REG_ESPACE;
3379 parse_bracket_exp_free_return:
3380 re_free (sbcset);
3381 #ifdef RE_ENABLE_I18N
3382 free_charset (mbcset);
3383 #endif /* RE_ENABLE_I18N */
3384 return NULL;
3385 }
3386
3387 /* Parse an element in the bracket expression. */
3388
3389 static reg_errcode_t
3390 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3391 re_token_t *token, int token_len, re_dfa_t *dfa,
3392 reg_syntax_t syntax, bool accept_hyphen)
3393 {
3394 #ifdef RE_ENABLE_I18N
3395 int cur_char_size;
3396 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3397 if (cur_char_size > 1)
3398 {
3399 elem->type = MB_CHAR;
3400 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3401 re_string_skip_bytes (regexp, cur_char_size);
3402 return REG_NOERROR;
3403 }
3404 #endif /* RE_ENABLE_I18N */
3405 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3406 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3407 || token->type == OP_OPEN_EQUIV_CLASS)
3408 return parse_bracket_symbol (elem, regexp, token);
3409 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3410 {
3411 /* A '-' must only appear as anything but a range indicator before
3412 the closing bracket. Everything else is an error. */
3413 re_token_t token2;
3414 (void) peek_token_bracket (&token2, regexp, syntax);
3415 if (token2.type != OP_CLOSE_BRACKET)
3416 /* The actual error value is not standardized since this whole
3417 case is undefined. But ERANGE makes good sense. */
3418 return REG_ERANGE;
3419 }
3420 elem->type = SB_CHAR;
3421 elem->opr.ch = token->opr.c;
3422 return REG_NOERROR;
3423 }
3424
3425 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3426 such as [:<character_class>:], [.<collating_element>.], and
3427 [=<equivalent_class>=]. */
3428
3429 static reg_errcode_t
3430 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3431 re_token_t *token)
3432 {
3433 unsigned char ch, delim = token->opr.c;
3434 int i = 0;
3435 if (re_string_eoi(regexp))
3436 return REG_EBRACK;
3437 for (;; ++i)
3438 {
3439 if (i >= BRACKET_NAME_BUF_SIZE)
3440 return REG_EBRACK;
3441 if (token->type == OP_OPEN_CHAR_CLASS)
3442 ch = re_string_fetch_byte_case (regexp);
3443 else
3444 ch = re_string_fetch_byte (regexp);
3445 if (re_string_eoi(regexp))
3446 return REG_EBRACK;
3447 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3448 break;
3449 elem->opr.name[i] = ch;
3450 }
3451 re_string_skip_bytes (regexp, 1);
3452 elem->opr.name[i] = '\0';
3453 switch (token->type)
3454 {
3455 case OP_OPEN_COLL_ELEM:
3456 elem->type = COLL_SYM;
3457 break;
3458 case OP_OPEN_EQUIV_CLASS:
3459 elem->type = EQUIV_CLASS;
3460 break;
3461 case OP_OPEN_CHAR_CLASS:
3462 elem->type = CHAR_CLASS;
3463 break;
3464 default:
3465 break;
3466 }
3467 return REG_NOERROR;
3468 }
3469
3470 /* Helper function for parse_bracket_exp.
3471 Build the equivalence class which is represented by NAME.
3472 The result are written to MBCSET and SBCSET.
3473 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3474 is a pointer argument since we may update it. */
3475
3476 static reg_errcode_t
3477 #ifdef RE_ENABLE_I18N
3478 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3479 Idx *equiv_class_alloc, const unsigned char *name)
3480 #else /* not RE_ENABLE_I18N */
3481 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3482 #endif /* not RE_ENABLE_I18N */
3483 {
3484 #ifdef _LIBC
3485 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3486 if (nrules != 0)
3487 {
3488 const int32_t *table, *indirect;
3489 const unsigned char *weights, *extra, *cp;
3490 unsigned char char_buf[2];
3491 int32_t idx1, idx2;
3492 unsigned int ch;
3493 size_t len;
3494 /* This #include defines a local function! */
3495 # include <locale/weight.h>
3496 /* Calculate the index for equivalence class. */
3497 cp = name;
3498 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3499 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3500 _NL_COLLATE_WEIGHTMB);
3501 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3502 _NL_COLLATE_EXTRAMB);
3503 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3504 _NL_COLLATE_INDIRECTMB);
3505 idx1 = findidx (&cp, -1);
3506 if (BE (idx1 == 0 || *cp != '\0', 0))
3507 /* This isn't a valid character. */
3508 return REG_ECOLLATE;
3509
3510 /* Build single byte matching table for this equivalence class. */
3511 len = weights[idx1 & 0xffffff];
3512 for (ch = 0; ch < SBC_MAX; ++ch)
3513 {
3514 char_buf[0] = ch;
3515 cp = char_buf;
3516 idx2 = findidx (&cp, 1);
3517 /*
3518 idx2 = table[ch];
3519 */
3520 if (idx2 == 0)
3521 /* This isn't a valid character. */
3522 continue;
3523 /* Compare only if the length matches and the collation rule
3524 index is the same. */
3525 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3526 {
3527 int cnt = 0;
3528
3529 while (cnt <= len &&
3530 weights[(idx1 & 0xffffff) + 1 + cnt]
3531 == weights[(idx2 & 0xffffff) + 1 + cnt])
3532 ++cnt;
3533
3534 if (cnt > len)
3535 bitset_set (sbcset, ch);
3536 }
3537 }
3538 /* Check whether the array has enough space. */
3539 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3540 {
3541 /* Not enough, realloc it. */
3542 /* +1 in case of mbcset->nequiv_classes is 0. */
3543 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3544 /* Use realloc since the array is NULL if *alloc == 0. */
3545 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3546 int32_t,
3547 new_equiv_class_alloc);
3548 if (BE (new_equiv_classes == NULL, 0))
3549 return REG_ESPACE;
3550 mbcset->equiv_classes = new_equiv_classes;
3551 *equiv_class_alloc = new_equiv_class_alloc;
3552 }
3553 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3554 }
3555 else
3556 #endif /* _LIBC */
3557 {
3558 if (BE (strlen ((const char *) name) != 1, 0))
3559 return REG_ECOLLATE;
3560 bitset_set (sbcset, *name);
3561 }
3562 return REG_NOERROR;
3563 }
3564
3565 /* Helper function for parse_bracket_exp.
3566 Build the character class which is represented by NAME.
3567 The result are written to MBCSET and SBCSET.
3568 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3569 is a pointer argument since we may update it. */
3570
3571 static reg_errcode_t
3572 #ifdef RE_ENABLE_I18N
3573 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3574 re_charset_t *mbcset, Idx *char_class_alloc,
3575 const char *class_name, reg_syntax_t syntax)
3576 #else /* not RE_ENABLE_I18N */
3577 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3578 const char *class_name, reg_syntax_t syntax)
3579 #endif /* not RE_ENABLE_I18N */
3580 {
3581 int i;
3582 const char *name = class_name;
3583
3584 /* In case of REG_ICASE "upper" and "lower" match the both of
3585 upper and lower cases. */
3586 if ((syntax & RE_ICASE)
3587 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3588 name = "alpha";
3589
3590 #ifdef RE_ENABLE_I18N
3591 /* Check the space of the arrays. */
3592 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3593 {
3594 /* Not enough, realloc it. */
3595 /* +1 in case of mbcset->nchar_classes is 0. */
3596 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3597 /* Use realloc since array is NULL if *alloc == 0. */
3598 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3599 new_char_class_alloc);
3600 if (BE (new_char_classes == NULL, 0))
3601 return REG_ESPACE;
3602 mbcset->char_classes = new_char_classes;
3603 *char_class_alloc = new_char_class_alloc;
3604 }
3605 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3606 #endif /* RE_ENABLE_I18N */
3607
3608 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3609 do { \
3610 if (BE (trans != NULL, 0)) \
3611 { \
3612 for (i = 0; i < SBC_MAX; ++i) \
3613 if (ctype_func (i)) \
3614 bitset_set (sbcset, trans[i]); \
3615 } \
3616 else \
3617 { \
3618 for (i = 0; i < SBC_MAX; ++i) \
3619 if (ctype_func (i)) \
3620 bitset_set (sbcset, i); \
3621 } \
3622 } while (0)
3623
3624 if (strcmp (name, "alnum") == 0)
3625 BUILD_CHARCLASS_LOOP (isalnum);
3626 else if (strcmp (name, "cntrl") == 0)
3627 BUILD_CHARCLASS_LOOP (iscntrl);
3628 else if (strcmp (name, "lower") == 0)
3629 BUILD_CHARCLASS_LOOP (islower);
3630 else if (strcmp (name, "space") == 0)
3631 BUILD_CHARCLASS_LOOP (isspace);
3632 else if (strcmp (name, "alpha") == 0)
3633 BUILD_CHARCLASS_LOOP (isalpha);
3634 else if (strcmp (name, "digit") == 0)
3635 BUILD_CHARCLASS_LOOP (isdigit);
3636 else if (strcmp (name, "print") == 0)
3637 BUILD_CHARCLASS_LOOP (isprint);
3638 else if (strcmp (name, "upper") == 0)
3639 BUILD_CHARCLASS_LOOP (isupper);
3640 else if (strcmp (name, "blank") == 0)
3641 BUILD_CHARCLASS_LOOP (isblank);
3642 else if (strcmp (name, "graph") == 0)
3643 BUILD_CHARCLASS_LOOP (isgraph);
3644 else if (strcmp (name, "punct") == 0)
3645 BUILD_CHARCLASS_LOOP (ispunct);
3646 else if (strcmp (name, "xdigit") == 0)
3647 BUILD_CHARCLASS_LOOP (isxdigit);
3648 else
3649 return REG_ECTYPE;
3650
3651 return REG_NOERROR;
3652 }
3653
3654 static bin_tree_t *
3655 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3656 const char *class_name,
3657 const char *extra, bool non_match,
3658 reg_errcode_t *err)
3659 {
3660 re_bitset_ptr_t sbcset;
3661 #ifdef RE_ENABLE_I18N
3662 re_charset_t *mbcset;
3663 Idx alloc = 0;
3664 #endif /* not RE_ENABLE_I18N */
3665 reg_errcode_t ret;
3666 re_token_t br_token;
3667 bin_tree_t *tree;
3668
3669 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3670 #ifdef RE_ENABLE_I18N
3671 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3672 #endif /* RE_ENABLE_I18N */
3673
3674 #ifdef RE_ENABLE_I18N
3675 if (BE (sbcset == NULL || mbcset == NULL, 0))
3676 #else /* not RE_ENABLE_I18N */
3677 if (BE (sbcset == NULL, 0))
3678 #endif /* not RE_ENABLE_I18N */
3679 {
3680 *err = REG_ESPACE;
3681 return NULL;
3682 }
3683
3684 if (non_match)
3685 {
3686 #ifdef RE_ENABLE_I18N
3687 mbcset->non_match = 1;
3688 #endif /* not RE_ENABLE_I18N */
3689 }
3690
3691 /* We don't care the syntax in this case. */
3692 ret = build_charclass (trans, sbcset,
3693 #ifdef RE_ENABLE_I18N
3694 mbcset, &alloc,
3695 #endif /* RE_ENABLE_I18N */
3696 class_name, 0);
3697
3698 if (BE (ret != REG_NOERROR, 0))
3699 {
3700 re_free (sbcset);
3701 #ifdef RE_ENABLE_I18N
3702 free_charset (mbcset);
3703 #endif /* RE_ENABLE_I18N */
3704 *err = ret;
3705 return NULL;
3706 }
3707 /* \w match '_' also. */
3708 for (; *extra; extra++)
3709 bitset_set (sbcset, *extra);
3710
3711 /* If it is non-matching list. */
3712 if (non_match)
3713 bitset_not (sbcset);
3714
3715 #ifdef RE_ENABLE_I18N
3716 /* Ensure only single byte characters are set. */
3717 if (dfa->mb_cur_max > 1)
3718 bitset_mask (sbcset, dfa->sb_char);
3719 #endif
3720
3721 /* Build a tree for simple bracket. */
3722 br_token.type = SIMPLE_BRACKET;
3723 br_token.opr.sbcset = sbcset;
3724 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3725 if (BE (tree == NULL, 0))
3726 goto build_word_op_espace;
3727
3728 #ifdef RE_ENABLE_I18N
3729 if (dfa->mb_cur_max > 1)
3730 {
3731 bin_tree_t *mbc_tree;
3732 /* Build a tree for complex bracket. */
3733 br_token.type = COMPLEX_BRACKET;
3734 br_token.opr.mbcset = mbcset;
3735 dfa->has_mb_node = 1;
3736 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3737 if (BE (mbc_tree == NULL, 0))
3738 goto build_word_op_espace;
3739 /* Then join them by ALT node. */
3740 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3741 if (BE (mbc_tree != NULL, 1))
3742 return tree;
3743 }
3744 else
3745 {
3746 free_charset (mbcset);
3747 return tree;
3748 }
3749 #else /* not RE_ENABLE_I18N */
3750 return tree;
3751 #endif /* not RE_ENABLE_I18N */
3752
3753 build_word_op_espace:
3754 re_free (sbcset);
3755 #ifdef RE_ENABLE_I18N
3756 free_charset (mbcset);
3757 #endif /* RE_ENABLE_I18N */
3758 *err = REG_ESPACE;
3759 return NULL;
3760 }
3761
3762 /* This is intended for the expressions like "a{1,3}".
3763 Fetch a number from 'input', and return the number.
3764 Return REG_MISSING if the number field is empty like "{,1}".
3765 Return RE_DUP_MAX + 1 if the number field is too large.
3766 Return REG_ERROR if an error occurred. */
3767
3768 static Idx
3769 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3770 {
3771 Idx num = REG_MISSING;
3772 unsigned char c;
3773 while (1)
3774 {
3775 fetch_token (token, input, syntax);
3776 c = token->opr.c;
3777 if (BE (token->type == END_OF_RE, 0))
3778 return REG_ERROR;
3779 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3780 break;
3781 num = ((token->type != CHARACTER || c < '0' || '9' < c
3782 || num == REG_ERROR)
3783 ? REG_ERROR
3784 : num == REG_MISSING
3785 ? c - '0'
3786 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3787 }
3788 return num;
3789 }
3790 \f
3791 #ifdef RE_ENABLE_I18N
3792 static void
3793 free_charset (re_charset_t *cset)
3794 {
3795 re_free (cset->mbchars);
3796 # ifdef _LIBC
3797 re_free (cset->coll_syms);
3798 re_free (cset->equiv_classes);
3799 re_free (cset->range_starts);
3800 re_free (cset->range_ends);
3801 # endif
3802 re_free (cset->char_classes);
3803 re_free (cset);
3804 }
3805 #endif /* RE_ENABLE_I18N */
3806 \f
3807 /* Functions for binary tree operation. */
3808
3809 /* Create a tree node. */
3810
3811 static bin_tree_t *
3812 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3813 re_token_type_t type)
3814 {
3815 re_token_t t;
3816 t.type = type;
3817 return create_token_tree (dfa, left, right, &t);
3818 }
3819
3820 static bin_tree_t *
3821 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3822 const re_token_t *token)
3823 {
3824 bin_tree_t *tree;
3825 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3826 {
3827 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3828
3829 if (storage == NULL)
3830 return NULL;
3831 storage->next = dfa->str_tree_storage;
3832 dfa->str_tree_storage = storage;
3833 dfa->str_tree_storage_idx = 0;
3834 }
3835 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3836
3837 tree->parent = NULL;
3838 tree->left = left;
3839 tree->right = right;
3840 tree->token = *token;
3841 tree->token.duplicated = 0;
3842 tree->token.opt_subexp = 0;
3843 tree->first = NULL;
3844 tree->next = NULL;
3845 tree->node_idx = REG_MISSING;
3846
3847 if (left != NULL)
3848 left->parent = tree;
3849 if (right != NULL)
3850 right->parent = tree;
3851 return tree;
3852 }
3853
3854 /* Mark the tree SRC as an optional subexpression.
3855 To be called from preorder or postorder. */
3856
3857 static reg_errcode_t
3858 mark_opt_subexp (void *extra, bin_tree_t *node)
3859 {
3860 Idx idx = (uintptr_t) extra;
3861 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3862 node->token.opt_subexp = 1;
3863
3864 return REG_NOERROR;
3865 }
3866
3867 /* Free the allocated memory inside NODE. */
3868
3869 static void
3870 free_token (re_token_t *node)
3871 {
3872 #ifdef RE_ENABLE_I18N
3873 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3874 free_charset (node->opr.mbcset);
3875 else
3876 #endif /* RE_ENABLE_I18N */
3877 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3878 re_free (node->opr.sbcset);
3879 }
3880
3881 /* Worker function for tree walking. Free the allocated memory inside NODE
3882 and its children. */
3883
3884 static reg_errcode_t
3885 free_tree (void *extra, bin_tree_t *node)
3886 {
3887 free_token (&node->token);
3888 return REG_NOERROR;
3889 }
3890
3891
3892 /* Duplicate the node SRC, and return new node. This is a preorder
3893 visit similar to the one implemented by the generic visitor, but
3894 we need more infrastructure to maintain two parallel trees --- so,
3895 it's easier to duplicate. */
3896
3897 static bin_tree_t *
3898 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3899 {
3900 const bin_tree_t *node;
3901 bin_tree_t *dup_root;
3902 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3903
3904 for (node = root; ; )
3905 {
3906 /* Create a new tree and link it back to the current parent. */
3907 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3908 if (*p_new == NULL)
3909 return NULL;
3910 (*p_new)->parent = dup_node;
3911 (*p_new)->token.duplicated = 1;
3912 dup_node = *p_new;
3913
3914 /* Go to the left node, or up and to the right. */
3915 if (node->left)
3916 {
3917 node = node->left;
3918 p_new = &dup_node->left;
3919 }
3920 else
3921 {
3922 const bin_tree_t *prev = NULL;
3923 while (node->right == prev || node->right == NULL)
3924 {
3925 prev = node;
3926 node = node->parent;
3927 dup_node = dup_node->parent;
3928 if (!node)
3929 return dup_root;
3930 }
3931 node = node->right;
3932 p_new = &dup_node->right;
3933 }
3934 }
3935 }