define-module for elisp special modules
[bpt/guile.git] / lib / regcomp.c
1 /* Extended regular expression matching and search library.
2 Copyright (C) 2002-2014 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
21 size_t length, reg_syntax_t syntax);
22 static void re_compile_fastmap_iter (regex_t *bufp,
23 const re_dfastate_t *init_state,
24 char *fastmap);
25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
26 #ifdef RE_ENABLE_I18N
27 static void free_charset (re_charset_t *cset);
28 #endif /* RE_ENABLE_I18N */
29 static void free_workarea_compile (regex_t *preg);
30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
31 #ifdef RE_ENABLE_I18N
32 static void optimize_utf8 (re_dfa_t *dfa);
33 #endif
34 static reg_errcode_t analyze (regex_t *preg);
35 static reg_errcode_t preorder (bin_tree_t *root,
36 reg_errcode_t (fn (void *, bin_tree_t *)),
37 void *extra);
38 static reg_errcode_t postorder (bin_tree_t *root,
39 reg_errcode_t (fn (void *, bin_tree_t *)),
40 void *extra);
41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
44 bin_tree_t *node);
45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
50 unsigned int constraint);
51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
53 Idx node, bool root);
54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
55 static Idx fetch_number (re_string_t *input, re_token_t *token,
56 reg_syntax_t syntax);
57 static int peek_token (re_token_t *token, re_string_t *input,
58 reg_syntax_t syntax) internal_function;
59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
60 reg_syntax_t syntax, reg_errcode_t *err);
61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
62 re_token_t *token, reg_syntax_t syntax,
63 Idx nest, reg_errcode_t *err);
64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
65 re_token_t *token, reg_syntax_t syntax,
66 Idx nest, reg_errcode_t *err);
67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
68 re_token_t *token, reg_syntax_t syntax,
69 Idx nest, reg_errcode_t *err);
70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
71 re_token_t *token, reg_syntax_t syntax,
72 Idx nest, reg_errcode_t *err);
73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
74 re_dfa_t *dfa, re_token_t *token,
75 reg_syntax_t syntax, reg_errcode_t *err);
76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
77 re_token_t *token, reg_syntax_t syntax,
78 reg_errcode_t *err);
79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
80 re_string_t *regexp,
81 re_token_t *token, int token_len,
82 re_dfa_t *dfa,
83 reg_syntax_t syntax,
84 bool accept_hyphen);
85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
86 re_string_t *regexp,
87 re_token_t *token);
88 #ifdef RE_ENABLE_I18N
89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
90 re_charset_t *mbcset,
91 Idx *equiv_class_alloc,
92 const unsigned char *name);
93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
94 bitset_t sbcset,
95 re_charset_t *mbcset,
96 Idx *char_class_alloc,
97 const char *class_name,
98 reg_syntax_t syntax);
99 #else /* not RE_ENABLE_I18N */
100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
101 const unsigned char *name);
102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
103 bitset_t sbcset,
104 const char *class_name,
105 reg_syntax_t syntax);
106 #endif /* not RE_ENABLE_I18N */
107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
108 RE_TRANSLATE_TYPE trans,
109 const char *class_name,
110 const char *extra,
111 bool non_match, reg_errcode_t *err);
112 static bin_tree_t *create_tree (re_dfa_t *dfa,
113 bin_tree_t *left, bin_tree_t *right,
114 re_token_type_t type);
115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
116 bin_tree_t *left, bin_tree_t *right,
117 const re_token_t *token);
118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
119 static void free_token (re_token_t *node);
120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
122 \f
123 /* This table gives an error message for each of the error codes listed
124 in regex.h. Obviously the order here has to be same as there.
125 POSIX doesn't require that we do anything for REG_NOERROR,
126 but why not be nice? */
127
128 static const char __re_error_msgid[] =
129 {
130 #define REG_NOERROR_IDX 0
131 gettext_noop ("Success") /* REG_NOERROR */
132 "\0"
133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
134 gettext_noop ("No match") /* REG_NOMATCH */
135 "\0"
136 #define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
137 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
138 "\0"
139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
140 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
141 "\0"
142 #define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
143 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
144 "\0"
145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
146 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
147 "\0"
148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
149 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
150 "\0"
151 #define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
152 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
153 "\0"
154 #define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
155 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
156 "\0"
157 #define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
158 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
159 "\0"
160 #define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
161 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
162 "\0"
163 #define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
164 gettext_noop ("Invalid range end") /* REG_ERANGE */
165 "\0"
166 #define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
167 gettext_noop ("Memory exhausted") /* REG_ESPACE */
168 "\0"
169 #define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
170 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
171 "\0"
172 #define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
173 gettext_noop ("Premature end of regular expression") /* REG_EEND */
174 "\0"
175 #define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
176 gettext_noop ("Regular expression too big") /* REG_ESIZE */
177 "\0"
178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
179 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
180 };
181
182 static const size_t __re_error_msgid_idx[] =
183 {
184 REG_NOERROR_IDX,
185 REG_NOMATCH_IDX,
186 REG_BADPAT_IDX,
187 REG_ECOLLATE_IDX,
188 REG_ECTYPE_IDX,
189 REG_EESCAPE_IDX,
190 REG_ESUBREG_IDX,
191 REG_EBRACK_IDX,
192 REG_EPAREN_IDX,
193 REG_EBRACE_IDX,
194 REG_BADBR_IDX,
195 REG_ERANGE_IDX,
196 REG_ESPACE_IDX,
197 REG_BADRPT_IDX,
198 REG_EEND_IDX,
199 REG_ESIZE_IDX,
200 REG_ERPAREN_IDX
201 };
202 \f
203 /* Entry points for GNU code. */
204
205 /* re_compile_pattern is the GNU regular expression compiler: it
206 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
207 Returns 0 if the pattern was valid, otherwise an error string.
208
209 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
210 are set in BUFP on entry. */
211
212 #ifdef _LIBC
213 const char *
214 re_compile_pattern (pattern, length, bufp)
215 const char *pattern;
216 size_t length;
217 struct re_pattern_buffer *bufp;
218 #else /* size_t might promote */
219 const char *
220 re_compile_pattern (const char *pattern, size_t length,
221 struct re_pattern_buffer *bufp)
222 #endif
223 {
224 reg_errcode_t ret;
225
226 /* And GNU code determines whether or not to get register information
227 by passing null for the REGS argument to re_match, etc., not by
228 setting no_sub, unless RE_NO_SUB is set. */
229 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
230
231 /* Match anchors at newline. */
232 bufp->newline_anchor = 1;
233
234 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
235
236 if (!ret)
237 return NULL;
238 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
239 }
240 #ifdef _LIBC
241 weak_alias (__re_compile_pattern, re_compile_pattern)
242 #endif
243
244 /* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
245 also be assigned to arbitrarily: each pattern buffer stores its own
246 syntax, so it can be changed between regex compilations. */
247 /* This has no initializer because initialized variables in Emacs
248 become read-only after dumping. */
249 reg_syntax_t re_syntax_options;
250
251
252 /* Specify the precise syntax of regexps for compilation. This provides
253 for compatibility for various utilities which historically have
254 different, incompatible syntaxes.
255
256 The argument SYNTAX is a bit mask comprised of the various bits
257 defined in regex.h. We return the old syntax. */
258
259 reg_syntax_t
260 re_set_syntax (syntax)
261 reg_syntax_t syntax;
262 {
263 reg_syntax_t ret = re_syntax_options;
264
265 re_syntax_options = syntax;
266 return ret;
267 }
268 #ifdef _LIBC
269 weak_alias (__re_set_syntax, re_set_syntax)
270 #endif
271
272 int
273 re_compile_fastmap (bufp)
274 struct re_pattern_buffer *bufp;
275 {
276 re_dfa_t *dfa = bufp->buffer;
277 char *fastmap = bufp->fastmap;
278
279 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
280 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
281 if (dfa->init_state != dfa->init_state_word)
282 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
283 if (dfa->init_state != dfa->init_state_nl)
284 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
285 if (dfa->init_state != dfa->init_state_begbuf)
286 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
287 bufp->fastmap_accurate = 1;
288 return 0;
289 }
290 #ifdef _LIBC
291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
292 #endif
293
294 static inline void
295 __attribute__ ((always_inline))
296 re_set_fastmap (char *fastmap, bool icase, int ch)
297 {
298 fastmap[ch] = 1;
299 if (icase)
300 fastmap[tolower (ch)] = 1;
301 }
302
303 /* Helper function for re_compile_fastmap.
304 Compile fastmap for the initial_state INIT_STATE. */
305
306 static void
307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
308 char *fastmap)
309 {
310 re_dfa_t *dfa = bufp->buffer;
311 Idx node_cnt;
312 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
313 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
314 {
315 Idx node = init_state->nodes.elems[node_cnt];
316 re_token_type_t type = dfa->nodes[node].type;
317
318 if (type == CHARACTER)
319 {
320 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
321 #ifdef RE_ENABLE_I18N
322 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
323 {
324 unsigned char buf[MB_LEN_MAX];
325 unsigned char *p;
326 wchar_t wc;
327 mbstate_t state;
328
329 p = buf;
330 *p++ = dfa->nodes[node].opr.c;
331 while (++node < dfa->nodes_len
332 && dfa->nodes[node].type == CHARACTER
333 && dfa->nodes[node].mb_partial)
334 *p++ = dfa->nodes[node].opr.c;
335 memset (&state, '\0', sizeof (state));
336 if (__mbrtowc (&wc, (const char *) buf, p - buf,
337 &state) == p - buf
338 && (__wcrtomb ((char *) buf, towlower (wc), &state)
339 != (size_t) -1))
340 re_set_fastmap (fastmap, false, buf[0]);
341 }
342 #endif
343 }
344 else if (type == SIMPLE_BRACKET)
345 {
346 int i, ch;
347 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
348 {
349 int j;
350 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
351 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
352 if (w & ((bitset_word_t) 1 << j))
353 re_set_fastmap (fastmap, icase, ch);
354 }
355 }
356 #ifdef RE_ENABLE_I18N
357 else if (type == COMPLEX_BRACKET)
358 {
359 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
360 Idx i;
361
362 # ifdef _LIBC
363 /* See if we have to try all bytes which start multiple collation
364 elements.
365 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
366 collation element, and don't catch 'b' since 'b' is
367 the only collation element which starts from 'b' (and
368 it is caught by SIMPLE_BRACKET). */
369 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
370 && (cset->ncoll_syms || cset->nranges))
371 {
372 const int32_t *table = (const int32_t *)
373 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
374 for (i = 0; i < SBC_MAX; ++i)
375 if (table[i] < 0)
376 re_set_fastmap (fastmap, icase, i);
377 }
378 # endif /* _LIBC */
379
380 /* See if we have to start the match at all multibyte characters,
381 i.e. where we would not find an invalid sequence. This only
382 applies to multibyte character sets; for single byte character
383 sets, the SIMPLE_BRACKET again suffices. */
384 if (dfa->mb_cur_max > 1
385 && (cset->nchar_classes || cset->non_match || cset->nranges
386 # ifdef _LIBC
387 || cset->nequiv_classes
388 # endif /* _LIBC */
389 ))
390 {
391 unsigned char c = 0;
392 do
393 {
394 mbstate_t mbs;
395 memset (&mbs, 0, sizeof (mbs));
396 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
397 re_set_fastmap (fastmap, false, (int) c);
398 }
399 while (++c != 0);
400 }
401
402 else
403 {
404 /* ... Else catch all bytes which can start the mbchars. */
405 for (i = 0; i < cset->nmbchars; ++i)
406 {
407 char buf[256];
408 mbstate_t state;
409 memset (&state, '\0', sizeof (state));
410 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
411 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
412 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
413 {
414 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
415 != (size_t) -1)
416 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
417 }
418 }
419 }
420 }
421 #endif /* RE_ENABLE_I18N */
422 else if (type == OP_PERIOD
423 #ifdef RE_ENABLE_I18N
424 || type == OP_UTF8_PERIOD
425 #endif /* RE_ENABLE_I18N */
426 || type == END_OF_RE)
427 {
428 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
429 if (type == END_OF_RE)
430 bufp->can_be_null = 1;
431 return;
432 }
433 }
434 }
435 \f
436 /* Entry point for POSIX code. */
437 /* regcomp takes a regular expression as a string and compiles it.
438
439 PREG is a regex_t *. We do not expect any fields to be initialized,
440 since POSIX says we shouldn't. Thus, we set
441
442 'buffer' to the compiled pattern;
443 'used' to the length of the compiled pattern;
444 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
445 REG_EXTENDED bit in CFLAGS is set; otherwise, to
446 RE_SYNTAX_POSIX_BASIC;
447 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
448 'fastmap' to an allocated space for the fastmap;
449 'fastmap_accurate' to zero;
450 're_nsub' to the number of subexpressions in PATTERN.
451
452 PATTERN is the address of the pattern string.
453
454 CFLAGS is a series of bits which affect compilation.
455
456 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
457 use POSIX basic syntax.
458
459 If REG_NEWLINE is set, then . and [^...] don't match newline.
460 Also, regexec will try a match beginning after every newline.
461
462 If REG_ICASE is set, then we considers upper- and lowercase
463 versions of letters to be equivalent when matching.
464
465 If REG_NOSUB is set, then when PREG is passed to regexec, that
466 routine will report only success or failure, and nothing about the
467 registers.
468
469 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
470 the return codes and their meanings.) */
471
472 int
473 regcomp (preg, pattern, cflags)
474 regex_t *_Restrict_ preg;
475 const char *_Restrict_ pattern;
476 int cflags;
477 {
478 reg_errcode_t ret;
479 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
480 : RE_SYNTAX_POSIX_BASIC);
481
482 preg->buffer = NULL;
483 preg->allocated = 0;
484 preg->used = 0;
485
486 /* Try to allocate space for the fastmap. */
487 preg->fastmap = re_malloc (char, SBC_MAX);
488 if (BE (preg->fastmap == NULL, 0))
489 return REG_ESPACE;
490
491 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
492
493 /* If REG_NEWLINE is set, newlines are treated differently. */
494 if (cflags & REG_NEWLINE)
495 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
496 syntax &= ~RE_DOT_NEWLINE;
497 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
498 /* It also changes the matching behavior. */
499 preg->newline_anchor = 1;
500 }
501 else
502 preg->newline_anchor = 0;
503 preg->no_sub = !!(cflags & REG_NOSUB);
504 preg->translate = NULL;
505
506 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
507
508 /* POSIX doesn't distinguish between an unmatched open-group and an
509 unmatched close-group: both are REG_EPAREN. */
510 if (ret == REG_ERPAREN)
511 ret = REG_EPAREN;
512
513 /* We have already checked preg->fastmap != NULL. */
514 if (BE (ret == REG_NOERROR, 1))
515 /* Compute the fastmap now, since regexec cannot modify the pattern
516 buffer. This function never fails in this implementation. */
517 (void) re_compile_fastmap (preg);
518 else
519 {
520 /* Some error occurred while compiling the expression. */
521 re_free (preg->fastmap);
522 preg->fastmap = NULL;
523 }
524
525 return (int) ret;
526 }
527 #ifdef _LIBC
528 weak_alias (__regcomp, regcomp)
529 #endif
530
531 /* Returns a message corresponding to an error code, ERRCODE, returned
532 from either regcomp or regexec. We don't use PREG here. */
533
534 #ifdef _LIBC
535 size_t
536 regerror (errcode, preg, errbuf, errbuf_size)
537 int errcode;
538 const regex_t *_Restrict_ preg;
539 char *_Restrict_ errbuf;
540 size_t errbuf_size;
541 #else /* size_t might promote */
542 size_t
543 regerror (int errcode, const regex_t *_Restrict_ preg,
544 char *_Restrict_ errbuf, size_t errbuf_size)
545 #endif
546 {
547 const char *msg;
548 size_t msg_size;
549
550 if (BE (errcode < 0
551 || errcode >= (int) (sizeof (__re_error_msgid_idx)
552 / sizeof (__re_error_msgid_idx[0])), 0))
553 /* Only error codes returned by the rest of the code should be passed
554 to this routine. If we are given anything else, or if other regex
555 code generates an invalid error code, then the program has a bug.
556 Dump core so we can fix it. */
557 abort ();
558
559 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
560
561 msg_size = strlen (msg) + 1; /* Includes the null. */
562
563 if (BE (errbuf_size != 0, 1))
564 {
565 size_t cpy_size = msg_size;
566 if (BE (msg_size > errbuf_size, 0))
567 {
568 cpy_size = errbuf_size - 1;
569 errbuf[cpy_size] = '\0';
570 }
571 memcpy (errbuf, msg, cpy_size);
572 }
573
574 return msg_size;
575 }
576 #ifdef _LIBC
577 weak_alias (__regerror, regerror)
578 #endif
579
580
581 #ifdef RE_ENABLE_I18N
582 /* This static array is used for the map to single-byte characters when
583 UTF-8 is used. Otherwise we would allocate memory just to initialize
584 it the same all the time. UTF-8 is the preferred encoding so this is
585 a worthwhile optimization. */
586 static const bitset_t utf8_sb_map =
587 {
588 /* Set the first 128 bits. */
589 # if defined __GNUC__ && !defined __STRICT_ANSI__
590 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
591 # else
592 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
593 # error "bitset_word_t is narrower than 32 bits"
594 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
595 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
596 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
597 BITSET_WORD_MAX, BITSET_WORD_MAX,
598 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
599 BITSET_WORD_MAX,
600 # endif
601 (BITSET_WORD_MAX
602 >> (SBC_MAX % BITSET_WORD_BITS == 0
603 ? 0
604 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
605 # endif
606 };
607 #endif
608
609
610 static void
611 free_dfa_content (re_dfa_t *dfa)
612 {
613 Idx i, j;
614
615 if (dfa->nodes)
616 for (i = 0; i < dfa->nodes_len; ++i)
617 free_token (dfa->nodes + i);
618 re_free (dfa->nexts);
619 for (i = 0; i < dfa->nodes_len; ++i)
620 {
621 if (dfa->eclosures != NULL)
622 re_node_set_free (dfa->eclosures + i);
623 if (dfa->inveclosures != NULL)
624 re_node_set_free (dfa->inveclosures + i);
625 if (dfa->edests != NULL)
626 re_node_set_free (dfa->edests + i);
627 }
628 re_free (dfa->edests);
629 re_free (dfa->eclosures);
630 re_free (dfa->inveclosures);
631 re_free (dfa->nodes);
632
633 if (dfa->state_table)
634 for (i = 0; i <= dfa->state_hash_mask; ++i)
635 {
636 struct re_state_table_entry *entry = dfa->state_table + i;
637 for (j = 0; j < entry->num; ++j)
638 {
639 re_dfastate_t *state = entry->array[j];
640 free_state (state);
641 }
642 re_free (entry->array);
643 }
644 re_free (dfa->state_table);
645 #ifdef RE_ENABLE_I18N
646 if (dfa->sb_char != utf8_sb_map)
647 re_free (dfa->sb_char);
648 #endif
649 re_free (dfa->subexp_map);
650 #ifdef DEBUG
651 re_free (dfa->re_str);
652 #endif
653
654 re_free (dfa);
655 }
656
657
658 /* Free dynamically allocated space used by PREG. */
659
660 void
661 regfree (preg)
662 regex_t *preg;
663 {
664 re_dfa_t *dfa = preg->buffer;
665 if (BE (dfa != NULL, 1))
666 {
667 lock_fini (dfa->lock);
668 free_dfa_content (dfa);
669 }
670 preg->buffer = NULL;
671 preg->allocated = 0;
672
673 re_free (preg->fastmap);
674 preg->fastmap = NULL;
675
676 re_free (preg->translate);
677 preg->translate = NULL;
678 }
679 #ifdef _LIBC
680 weak_alias (__regfree, regfree)
681 #endif
682 \f
683 /* Entry points compatible with 4.2 BSD regex library. We don't define
684 them unless specifically requested. */
685
686 #if defined _REGEX_RE_COMP || defined _LIBC
687
688 /* BSD has one and only one pattern buffer. */
689 static struct re_pattern_buffer re_comp_buf;
690
691 char *
692 # ifdef _LIBC
693 /* Make these definitions weak in libc, so POSIX programs can redefine
694 these names if they don't use our functions, and still use
695 regcomp/regexec above without link errors. */
696 weak_function
697 # endif
698 re_comp (s)
699 const char *s;
700 {
701 reg_errcode_t ret;
702 char *fastmap;
703
704 if (!s)
705 {
706 if (!re_comp_buf.buffer)
707 return gettext ("No previous regular expression");
708 return 0;
709 }
710
711 if (re_comp_buf.buffer)
712 {
713 fastmap = re_comp_buf.fastmap;
714 re_comp_buf.fastmap = NULL;
715 __regfree (&re_comp_buf);
716 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
717 re_comp_buf.fastmap = fastmap;
718 }
719
720 if (re_comp_buf.fastmap == NULL)
721 {
722 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
723 if (re_comp_buf.fastmap == NULL)
724 return (char *) gettext (__re_error_msgid
725 + __re_error_msgid_idx[(int) REG_ESPACE]);
726 }
727
728 /* Since 're_exec' always passes NULL for the 'regs' argument, we
729 don't need to initialize the pattern buffer fields which affect it. */
730
731 /* Match anchors at newlines. */
732 re_comp_buf.newline_anchor = 1;
733
734 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
735
736 if (!ret)
737 return NULL;
738
739 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
740 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
741 }
742
743 #ifdef _LIBC
744 libc_freeres_fn (free_mem)
745 {
746 __regfree (&re_comp_buf);
747 }
748 #endif
749
750 #endif /* _REGEX_RE_COMP */
751 \f
752 /* Internal entry point.
753 Compile the regular expression PATTERN, whose length is LENGTH.
754 SYNTAX indicate regular expression's syntax. */
755
756 static reg_errcode_t
757 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
758 reg_syntax_t syntax)
759 {
760 reg_errcode_t err = REG_NOERROR;
761 re_dfa_t *dfa;
762 re_string_t regexp;
763
764 /* Initialize the pattern buffer. */
765 preg->fastmap_accurate = 0;
766 preg->syntax = syntax;
767 preg->not_bol = preg->not_eol = 0;
768 preg->used = 0;
769 preg->re_nsub = 0;
770 preg->can_be_null = 0;
771 preg->regs_allocated = REGS_UNALLOCATED;
772
773 /* Initialize the dfa. */
774 dfa = preg->buffer;
775 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
776 {
777 /* If zero allocated, but buffer is non-null, try to realloc
778 enough space. This loses if buffer's address is bogus, but
779 that is the user's responsibility. If ->buffer is NULL this
780 is a simple allocation. */
781 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
782 if (dfa == NULL)
783 return REG_ESPACE;
784 preg->allocated = sizeof (re_dfa_t);
785 preg->buffer = dfa;
786 }
787 preg->used = sizeof (re_dfa_t);
788
789 err = init_dfa (dfa, length);
790 if (BE (err == REG_NOERROR && lock_init (dfa->lock) != 0, 0))
791 err = REG_ESPACE;
792 if (BE (err != REG_NOERROR, 0))
793 {
794 free_dfa_content (dfa);
795 preg->buffer = NULL;
796 preg->allocated = 0;
797 return err;
798 }
799 #ifdef DEBUG
800 /* Note: length+1 will not overflow since it is checked in init_dfa. */
801 dfa->re_str = re_malloc (char, length + 1);
802 strncpy (dfa->re_str, pattern, length + 1);
803 #endif
804
805 err = re_string_construct (&regexp, pattern, length, preg->translate,
806 (syntax & RE_ICASE) != 0, dfa);
807 if (BE (err != REG_NOERROR, 0))
808 {
809 re_compile_internal_free_return:
810 free_workarea_compile (preg);
811 re_string_destruct (&regexp);
812 lock_fini (dfa->lock);
813 free_dfa_content (dfa);
814 preg->buffer = NULL;
815 preg->allocated = 0;
816 return err;
817 }
818
819 /* Parse the regular expression, and build a structure tree. */
820 preg->re_nsub = 0;
821 dfa->str_tree = parse (&regexp, preg, syntax, &err);
822 if (BE (dfa->str_tree == NULL, 0))
823 goto re_compile_internal_free_return;
824
825 /* Analyze the tree and create the nfa. */
826 err = analyze (preg);
827 if (BE (err != REG_NOERROR, 0))
828 goto re_compile_internal_free_return;
829
830 #ifdef RE_ENABLE_I18N
831 /* If possible, do searching in single byte encoding to speed things up. */
832 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
833 optimize_utf8 (dfa);
834 #endif
835
836 /* Then create the initial state of the dfa. */
837 err = create_initial_state (dfa);
838
839 /* Release work areas. */
840 free_workarea_compile (preg);
841 re_string_destruct (&regexp);
842
843 if (BE (err != REG_NOERROR, 0))
844 {
845 lock_fini (dfa->lock);
846 free_dfa_content (dfa);
847 preg->buffer = NULL;
848 preg->allocated = 0;
849 }
850
851 return err;
852 }
853
854 /* Initialize DFA. We use the length of the regular expression PAT_LEN
855 as the initial length of some arrays. */
856
857 static reg_errcode_t
858 init_dfa (re_dfa_t *dfa, size_t pat_len)
859 {
860 __re_size_t table_size;
861 #ifndef _LIBC
862 const char *codeset_name;
863 #endif
864 #ifdef RE_ENABLE_I18N
865 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
866 #else
867 size_t max_i18n_object_size = 0;
868 #endif
869 size_t max_object_size =
870 MAX (sizeof (struct re_state_table_entry),
871 MAX (sizeof (re_token_t),
872 MAX (sizeof (re_node_set),
873 MAX (sizeof (regmatch_t),
874 max_i18n_object_size))));
875
876 memset (dfa, '\0', sizeof (re_dfa_t));
877
878 /* Force allocation of str_tree_storage the first time. */
879 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
880
881 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
882 calculation below, and for similar doubling calculations
883 elsewhere. And it's <= rather than <, because some of the
884 doubling calculations add 1 afterwards. */
885 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
886 return REG_ESPACE;
887
888 dfa->nodes_alloc = pat_len + 1;
889 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
890
891 /* table_size = 2 ^ ceil(log pat_len) */
892 for (table_size = 1; ; table_size <<= 1)
893 if (table_size > pat_len)
894 break;
895
896 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
897 dfa->state_hash_mask = table_size - 1;
898
899 dfa->mb_cur_max = MB_CUR_MAX;
900 #ifdef _LIBC
901 if (dfa->mb_cur_max == 6
902 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
903 dfa->is_utf8 = 1;
904 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
905 != 0);
906 #else
907 codeset_name = nl_langinfo (CODESET);
908 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
909 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
910 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
911 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
912 dfa->is_utf8 = 1;
913
914 /* We check exhaustively in the loop below if this charset is a
915 superset of ASCII. */
916 dfa->map_notascii = 0;
917 #endif
918
919 #ifdef RE_ENABLE_I18N
920 if (dfa->mb_cur_max > 1)
921 {
922 if (dfa->is_utf8)
923 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
924 else
925 {
926 int i, j, ch;
927
928 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
929 if (BE (dfa->sb_char == NULL, 0))
930 return REG_ESPACE;
931
932 /* Set the bits corresponding to single byte chars. */
933 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
934 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
935 {
936 wint_t wch = __btowc (ch);
937 if (wch != WEOF)
938 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
939 # ifndef _LIBC
940 if (isascii (ch) && wch != ch)
941 dfa->map_notascii = 1;
942 # endif
943 }
944 }
945 }
946 #endif
947
948 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
949 return REG_ESPACE;
950 return REG_NOERROR;
951 }
952
953 /* Initialize WORD_CHAR table, which indicate which character is
954 "word". In this case "word" means that it is the word construction
955 character used by some operators like "\<", "\>", etc. */
956
957 static void
958 internal_function
959 init_word_char (re_dfa_t *dfa)
960 {
961 int i = 0;
962 int j;
963 int ch = 0;
964 dfa->word_ops_used = 1;
965 if (BE (dfa->map_notascii == 0, 1))
966 {
967 bitset_word_t bits0 = 0x00000000;
968 bitset_word_t bits1 = 0x03ff0000;
969 bitset_word_t bits2 = 0x87fffffe;
970 bitset_word_t bits3 = 0x07fffffe;
971 if (BITSET_WORD_BITS == 64)
972 {
973 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
974 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
975 i = 2;
976 }
977 else if (BITSET_WORD_BITS == 32)
978 {
979 dfa->word_char[0] = bits0;
980 dfa->word_char[1] = bits1;
981 dfa->word_char[2] = bits2;
982 dfa->word_char[3] = bits3;
983 i = 4;
984 }
985 else
986 goto general_case;
987 ch = 128;
988
989 if (BE (dfa->is_utf8, 1))
990 {
991 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
992 return;
993 }
994 }
995
996 general_case:
997 for (; i < BITSET_WORDS; ++i)
998 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
999 if (isalnum (ch) || ch == '_')
1000 dfa->word_char[i] |= (bitset_word_t) 1 << j;
1001 }
1002
1003 /* Free the work area which are only used while compiling. */
1004
1005 static void
1006 free_workarea_compile (regex_t *preg)
1007 {
1008 re_dfa_t *dfa = preg->buffer;
1009 bin_tree_storage_t *storage, *next;
1010 for (storage = dfa->str_tree_storage; storage; storage = next)
1011 {
1012 next = storage->next;
1013 re_free (storage);
1014 }
1015 dfa->str_tree_storage = NULL;
1016 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1017 dfa->str_tree = NULL;
1018 re_free (dfa->org_indices);
1019 dfa->org_indices = NULL;
1020 }
1021
1022 /* Create initial states for all contexts. */
1023
1024 static reg_errcode_t
1025 create_initial_state (re_dfa_t *dfa)
1026 {
1027 Idx first, i;
1028 reg_errcode_t err;
1029 re_node_set init_nodes;
1030
1031 /* Initial states have the epsilon closure of the node which is
1032 the first node of the regular expression. */
1033 first = dfa->str_tree->first->node_idx;
1034 dfa->init_node = first;
1035 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1036 if (BE (err != REG_NOERROR, 0))
1037 return err;
1038
1039 /* The back-references which are in initial states can epsilon transit,
1040 since in this case all of the subexpressions can be null.
1041 Then we add epsilon closures of the nodes which are the next nodes of
1042 the back-references. */
1043 if (dfa->nbackref > 0)
1044 for (i = 0; i < init_nodes.nelem; ++i)
1045 {
1046 Idx node_idx = init_nodes.elems[i];
1047 re_token_type_t type = dfa->nodes[node_idx].type;
1048
1049 Idx clexp_idx;
1050 if (type != OP_BACK_REF)
1051 continue;
1052 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1053 {
1054 re_token_t *clexp_node;
1055 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1056 if (clexp_node->type == OP_CLOSE_SUBEXP
1057 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1058 break;
1059 }
1060 if (clexp_idx == init_nodes.nelem)
1061 continue;
1062
1063 if (type == OP_BACK_REF)
1064 {
1065 Idx dest_idx = dfa->edests[node_idx].elems[0];
1066 if (!re_node_set_contains (&init_nodes, dest_idx))
1067 {
1068 reg_errcode_t merge_err
1069 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1070 if (merge_err != REG_NOERROR)
1071 return merge_err;
1072 i = 0;
1073 }
1074 }
1075 }
1076
1077 /* It must be the first time to invoke acquire_state. */
1078 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1079 /* We don't check ERR here, since the initial state must not be NULL. */
1080 if (BE (dfa->init_state == NULL, 0))
1081 return err;
1082 if (dfa->init_state->has_constraint)
1083 {
1084 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1085 CONTEXT_WORD);
1086 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1087 CONTEXT_NEWLINE);
1088 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1089 &init_nodes,
1090 CONTEXT_NEWLINE
1091 | CONTEXT_BEGBUF);
1092 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1093 || dfa->init_state_begbuf == NULL, 0))
1094 return err;
1095 }
1096 else
1097 dfa->init_state_word = dfa->init_state_nl
1098 = dfa->init_state_begbuf = dfa->init_state;
1099
1100 re_node_set_free (&init_nodes);
1101 return REG_NOERROR;
1102 }
1103 \f
1104 #ifdef RE_ENABLE_I18N
1105 /* If it is possible to do searching in single byte encoding instead of UTF-8
1106 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1107 DFA nodes where needed. */
1108
1109 static void
1110 optimize_utf8 (re_dfa_t *dfa)
1111 {
1112 Idx node;
1113 int i;
1114 bool mb_chars = false;
1115 bool has_period = false;
1116
1117 for (node = 0; node < dfa->nodes_len; ++node)
1118 switch (dfa->nodes[node].type)
1119 {
1120 case CHARACTER:
1121 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1122 mb_chars = true;
1123 break;
1124 case ANCHOR:
1125 switch (dfa->nodes[node].opr.ctx_type)
1126 {
1127 case LINE_FIRST:
1128 case LINE_LAST:
1129 case BUF_FIRST:
1130 case BUF_LAST:
1131 break;
1132 default:
1133 /* Word anchors etc. cannot be handled. It's okay to test
1134 opr.ctx_type since constraints (for all DFA nodes) are
1135 created by ORing one or more opr.ctx_type values. */
1136 return;
1137 }
1138 break;
1139 case OP_PERIOD:
1140 has_period = true;
1141 break;
1142 case OP_BACK_REF:
1143 case OP_ALT:
1144 case END_OF_RE:
1145 case OP_DUP_ASTERISK:
1146 case OP_OPEN_SUBEXP:
1147 case OP_CLOSE_SUBEXP:
1148 break;
1149 case COMPLEX_BRACKET:
1150 return;
1151 case SIMPLE_BRACKET:
1152 /* Just double check. */
1153 {
1154 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1155 ? 0
1156 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1157 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1158 {
1159 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1160 return;
1161 rshift = 0;
1162 }
1163 }
1164 break;
1165 default:
1166 abort ();
1167 }
1168
1169 if (mb_chars || has_period)
1170 for (node = 0; node < dfa->nodes_len; ++node)
1171 {
1172 if (dfa->nodes[node].type == CHARACTER
1173 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1174 dfa->nodes[node].mb_partial = 0;
1175 else if (dfa->nodes[node].type == OP_PERIOD)
1176 dfa->nodes[node].type = OP_UTF8_PERIOD;
1177 }
1178
1179 /* The search can be in single byte locale. */
1180 dfa->mb_cur_max = 1;
1181 dfa->is_utf8 = 0;
1182 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1183 }
1184 #endif
1185 \f
1186 /* Analyze the structure tree, and calculate "first", "next", "edest",
1187 "eclosure", and "inveclosure". */
1188
1189 static reg_errcode_t
1190 analyze (regex_t *preg)
1191 {
1192 re_dfa_t *dfa = preg->buffer;
1193 reg_errcode_t ret;
1194
1195 /* Allocate arrays. */
1196 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1197 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1198 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1199 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1200 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1201 || dfa->eclosures == NULL, 0))
1202 return REG_ESPACE;
1203
1204 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1205 if (dfa->subexp_map != NULL)
1206 {
1207 Idx i;
1208 for (i = 0; i < preg->re_nsub; i++)
1209 dfa->subexp_map[i] = i;
1210 preorder (dfa->str_tree, optimize_subexps, dfa);
1211 for (i = 0; i < preg->re_nsub; i++)
1212 if (dfa->subexp_map[i] != i)
1213 break;
1214 if (i == preg->re_nsub)
1215 {
1216 free (dfa->subexp_map);
1217 dfa->subexp_map = NULL;
1218 }
1219 }
1220
1221 ret = postorder (dfa->str_tree, lower_subexps, preg);
1222 if (BE (ret != REG_NOERROR, 0))
1223 return ret;
1224 ret = postorder (dfa->str_tree, calc_first, dfa);
1225 if (BE (ret != REG_NOERROR, 0))
1226 return ret;
1227 preorder (dfa->str_tree, calc_next, dfa);
1228 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1229 if (BE (ret != REG_NOERROR, 0))
1230 return ret;
1231 ret = calc_eclosure (dfa);
1232 if (BE (ret != REG_NOERROR, 0))
1233 return ret;
1234
1235 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1236 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1237 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1238 || dfa->nbackref)
1239 {
1240 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1241 if (BE (dfa->inveclosures == NULL, 0))
1242 return REG_ESPACE;
1243 ret = calc_inveclosure (dfa);
1244 }
1245
1246 return ret;
1247 }
1248
1249 /* Our parse trees are very unbalanced, so we cannot use a stack to
1250 implement parse tree visits. Instead, we use parent pointers and
1251 some hairy code in these two functions. */
1252 static reg_errcode_t
1253 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1254 void *extra)
1255 {
1256 bin_tree_t *node, *prev;
1257
1258 for (node = root; ; )
1259 {
1260 /* Descend down the tree, preferably to the left (or to the right
1261 if that's the only child). */
1262 while (node->left || node->right)
1263 if (node->left)
1264 node = node->left;
1265 else
1266 node = node->right;
1267
1268 do
1269 {
1270 reg_errcode_t err = fn (extra, node);
1271 if (BE (err != REG_NOERROR, 0))
1272 return err;
1273 if (node->parent == NULL)
1274 return REG_NOERROR;
1275 prev = node;
1276 node = node->parent;
1277 }
1278 /* Go up while we have a node that is reached from the right. */
1279 while (node->right == prev || node->right == NULL);
1280 node = node->right;
1281 }
1282 }
1283
1284 static reg_errcode_t
1285 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1286 void *extra)
1287 {
1288 bin_tree_t *node;
1289
1290 for (node = root; ; )
1291 {
1292 reg_errcode_t err = fn (extra, node);
1293 if (BE (err != REG_NOERROR, 0))
1294 return err;
1295
1296 /* Go to the left node, or up and to the right. */
1297 if (node->left)
1298 node = node->left;
1299 else
1300 {
1301 bin_tree_t *prev = NULL;
1302 while (node->right == prev || node->right == NULL)
1303 {
1304 prev = node;
1305 node = node->parent;
1306 if (!node)
1307 return REG_NOERROR;
1308 }
1309 node = node->right;
1310 }
1311 }
1312 }
1313
1314 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1315 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1316 backreferences as well. Requires a preorder visit. */
1317 static reg_errcode_t
1318 optimize_subexps (void *extra, bin_tree_t *node)
1319 {
1320 re_dfa_t *dfa = (re_dfa_t *) extra;
1321
1322 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1323 {
1324 int idx = node->token.opr.idx;
1325 node->token.opr.idx = dfa->subexp_map[idx];
1326 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1327 }
1328
1329 else if (node->token.type == SUBEXP
1330 && node->left && node->left->token.type == SUBEXP)
1331 {
1332 Idx other_idx = node->left->token.opr.idx;
1333
1334 node->left = node->left->left;
1335 if (node->left)
1336 node->left->parent = node;
1337
1338 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1339 if (other_idx < BITSET_WORD_BITS)
1340 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1341 }
1342
1343 return REG_NOERROR;
1344 }
1345
1346 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1347 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1348 static reg_errcode_t
1349 lower_subexps (void *extra, bin_tree_t *node)
1350 {
1351 regex_t *preg = (regex_t *) extra;
1352 reg_errcode_t err = REG_NOERROR;
1353
1354 if (node->left && node->left->token.type == SUBEXP)
1355 {
1356 node->left = lower_subexp (&err, preg, node->left);
1357 if (node->left)
1358 node->left->parent = node;
1359 }
1360 if (node->right && node->right->token.type == SUBEXP)
1361 {
1362 node->right = lower_subexp (&err, preg, node->right);
1363 if (node->right)
1364 node->right->parent = node;
1365 }
1366
1367 return err;
1368 }
1369
1370 static bin_tree_t *
1371 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1372 {
1373 re_dfa_t *dfa = preg->buffer;
1374 bin_tree_t *body = node->left;
1375 bin_tree_t *op, *cls, *tree1, *tree;
1376
1377 if (preg->no_sub
1378 /* We do not optimize empty subexpressions, because otherwise we may
1379 have bad CONCAT nodes with NULL children. This is obviously not
1380 very common, so we do not lose much. An example that triggers
1381 this case is the sed "script" /\(\)/x. */
1382 && node->left != NULL
1383 && (node->token.opr.idx >= BITSET_WORD_BITS
1384 || !(dfa->used_bkref_map
1385 & ((bitset_word_t) 1 << node->token.opr.idx))))
1386 return node->left;
1387
1388 /* Convert the SUBEXP node to the concatenation of an
1389 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1390 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1391 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1392 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1393 tree = create_tree (dfa, op, tree1, CONCAT);
1394 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1395 {
1396 *err = REG_ESPACE;
1397 return NULL;
1398 }
1399
1400 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1401 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1402 return tree;
1403 }
1404
1405 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1406 nodes. Requires a postorder visit. */
1407 static reg_errcode_t
1408 calc_first (void *extra, bin_tree_t *node)
1409 {
1410 re_dfa_t *dfa = (re_dfa_t *) extra;
1411 if (node->token.type == CONCAT)
1412 {
1413 node->first = node->left->first;
1414 node->node_idx = node->left->node_idx;
1415 }
1416 else
1417 {
1418 node->first = node;
1419 node->node_idx = re_dfa_add_node (dfa, node->token);
1420 if (BE (node->node_idx == REG_MISSING, 0))
1421 return REG_ESPACE;
1422 if (node->token.type == ANCHOR)
1423 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1424 }
1425 return REG_NOERROR;
1426 }
1427
1428 /* Pass 2: compute NEXT on the tree. Preorder visit. */
1429 static reg_errcode_t
1430 calc_next (void *extra, bin_tree_t *node)
1431 {
1432 switch (node->token.type)
1433 {
1434 case OP_DUP_ASTERISK:
1435 node->left->next = node;
1436 break;
1437 case CONCAT:
1438 node->left->next = node->right->first;
1439 node->right->next = node->next;
1440 break;
1441 default:
1442 if (node->left)
1443 node->left->next = node->next;
1444 if (node->right)
1445 node->right->next = node->next;
1446 break;
1447 }
1448 return REG_NOERROR;
1449 }
1450
1451 /* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1452 static reg_errcode_t
1453 link_nfa_nodes (void *extra, bin_tree_t *node)
1454 {
1455 re_dfa_t *dfa = (re_dfa_t *) extra;
1456 Idx idx = node->node_idx;
1457 reg_errcode_t err = REG_NOERROR;
1458
1459 switch (node->token.type)
1460 {
1461 case CONCAT:
1462 break;
1463
1464 case END_OF_RE:
1465 assert (node->next == NULL);
1466 break;
1467
1468 case OP_DUP_ASTERISK:
1469 case OP_ALT:
1470 {
1471 Idx left, right;
1472 dfa->has_plural_match = 1;
1473 if (node->left != NULL)
1474 left = node->left->first->node_idx;
1475 else
1476 left = node->next->node_idx;
1477 if (node->right != NULL)
1478 right = node->right->first->node_idx;
1479 else
1480 right = node->next->node_idx;
1481 assert (REG_VALID_INDEX (left));
1482 assert (REG_VALID_INDEX (right));
1483 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1484 }
1485 break;
1486
1487 case ANCHOR:
1488 case OP_OPEN_SUBEXP:
1489 case OP_CLOSE_SUBEXP:
1490 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1491 break;
1492
1493 case OP_BACK_REF:
1494 dfa->nexts[idx] = node->next->node_idx;
1495 if (node->token.type == OP_BACK_REF)
1496 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1497 break;
1498
1499 default:
1500 assert (!IS_EPSILON_NODE (node->token.type));
1501 dfa->nexts[idx] = node->next->node_idx;
1502 break;
1503 }
1504
1505 return err;
1506 }
1507
1508 /* Duplicate the epsilon closure of the node ROOT_NODE.
1509 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1510 to their own constraint. */
1511
1512 static reg_errcode_t
1513 internal_function
1514 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1515 Idx root_node, unsigned int init_constraint)
1516 {
1517 Idx org_node, clone_node;
1518 bool ok;
1519 unsigned int constraint = init_constraint;
1520 for (org_node = top_org_node, clone_node = top_clone_node;;)
1521 {
1522 Idx org_dest, clone_dest;
1523 if (dfa->nodes[org_node].type == OP_BACK_REF)
1524 {
1525 /* If the back reference epsilon-transit, its destination must
1526 also have the constraint. Then duplicate the epsilon closure
1527 of the destination of the back reference, and store it in
1528 edests of the back reference. */
1529 org_dest = dfa->nexts[org_node];
1530 re_node_set_empty (dfa->edests + clone_node);
1531 clone_dest = duplicate_node (dfa, org_dest, constraint);
1532 if (BE (clone_dest == REG_MISSING, 0))
1533 return REG_ESPACE;
1534 dfa->nexts[clone_node] = dfa->nexts[org_node];
1535 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536 if (BE (! ok, 0))
1537 return REG_ESPACE;
1538 }
1539 else if (dfa->edests[org_node].nelem == 0)
1540 {
1541 /* In case of the node can't epsilon-transit, don't duplicate the
1542 destination and store the original destination as the
1543 destination of the node. */
1544 dfa->nexts[clone_node] = dfa->nexts[org_node];
1545 break;
1546 }
1547 else if (dfa->edests[org_node].nelem == 1)
1548 {
1549 /* In case of the node can epsilon-transit, and it has only one
1550 destination. */
1551 org_dest = dfa->edests[org_node].elems[0];
1552 re_node_set_empty (dfa->edests + clone_node);
1553 /* If the node is root_node itself, it means the epsilon closure
1554 has a loop. Then tie it to the destination of the root_node. */
1555 if (org_node == root_node && clone_node != org_node)
1556 {
1557 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1558 if (BE (! ok, 0))
1559 return REG_ESPACE;
1560 break;
1561 }
1562 /* In case the node has another constraint, append it. */
1563 constraint |= dfa->nodes[org_node].constraint;
1564 clone_dest = duplicate_node (dfa, org_dest, constraint);
1565 if (BE (clone_dest == REG_MISSING, 0))
1566 return REG_ESPACE;
1567 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1568 if (BE (! ok, 0))
1569 return REG_ESPACE;
1570 }
1571 else /* dfa->edests[org_node].nelem == 2 */
1572 {
1573 /* In case of the node can epsilon-transit, and it has two
1574 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1575 org_dest = dfa->edests[org_node].elems[0];
1576 re_node_set_empty (dfa->edests + clone_node);
1577 /* Search for a duplicated node which satisfies the constraint. */
1578 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1579 if (clone_dest == REG_MISSING)
1580 {
1581 /* There is no such duplicated node, create a new one. */
1582 reg_errcode_t err;
1583 clone_dest = duplicate_node (dfa, org_dest, constraint);
1584 if (BE (clone_dest == REG_MISSING, 0))
1585 return REG_ESPACE;
1586 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1587 if (BE (! ok, 0))
1588 return REG_ESPACE;
1589 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1590 root_node, constraint);
1591 if (BE (err != REG_NOERROR, 0))
1592 return err;
1593 }
1594 else
1595 {
1596 /* There is a duplicated node which satisfies the constraint,
1597 use it to avoid infinite loop. */
1598 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1599 if (BE (! ok, 0))
1600 return REG_ESPACE;
1601 }
1602
1603 org_dest = dfa->edests[org_node].elems[1];
1604 clone_dest = duplicate_node (dfa, org_dest, constraint);
1605 if (BE (clone_dest == REG_MISSING, 0))
1606 return REG_ESPACE;
1607 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1608 if (BE (! ok, 0))
1609 return REG_ESPACE;
1610 }
1611 org_node = org_dest;
1612 clone_node = clone_dest;
1613 }
1614 return REG_NOERROR;
1615 }
1616
1617 /* Search for a node which is duplicated from the node ORG_NODE, and
1618 satisfies the constraint CONSTRAINT. */
1619
1620 static Idx
1621 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1622 unsigned int constraint)
1623 {
1624 Idx idx;
1625 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1626 {
1627 if (org_node == dfa->org_indices[idx]
1628 && constraint == dfa->nodes[idx].constraint)
1629 return idx; /* Found. */
1630 }
1631 return REG_MISSING; /* Not found. */
1632 }
1633
1634 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1635 Return the index of the new node, or REG_MISSING if insufficient storage is
1636 available. */
1637
1638 static Idx
1639 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1640 {
1641 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1642 if (BE (dup_idx != REG_MISSING, 1))
1643 {
1644 dfa->nodes[dup_idx].constraint = constraint;
1645 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1646 dfa->nodes[dup_idx].duplicated = 1;
1647
1648 /* Store the index of the original node. */
1649 dfa->org_indices[dup_idx] = org_idx;
1650 }
1651 return dup_idx;
1652 }
1653
1654 static reg_errcode_t
1655 calc_inveclosure (re_dfa_t *dfa)
1656 {
1657 Idx src, idx;
1658 bool ok;
1659 for (idx = 0; idx < dfa->nodes_len; ++idx)
1660 re_node_set_init_empty (dfa->inveclosures + idx);
1661
1662 for (src = 0; src < dfa->nodes_len; ++src)
1663 {
1664 Idx *elems = dfa->eclosures[src].elems;
1665 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1666 {
1667 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1668 if (BE (! ok, 0))
1669 return REG_ESPACE;
1670 }
1671 }
1672
1673 return REG_NOERROR;
1674 }
1675
1676 /* Calculate "eclosure" for all the node in DFA. */
1677
1678 static reg_errcode_t
1679 calc_eclosure (re_dfa_t *dfa)
1680 {
1681 Idx node_idx;
1682 bool incomplete;
1683 #ifdef DEBUG
1684 assert (dfa->nodes_len > 0);
1685 #endif
1686 incomplete = false;
1687 /* For each nodes, calculate epsilon closure. */
1688 for (node_idx = 0; ; ++node_idx)
1689 {
1690 reg_errcode_t err;
1691 re_node_set eclosure_elem;
1692 if (node_idx == dfa->nodes_len)
1693 {
1694 if (!incomplete)
1695 break;
1696 incomplete = false;
1697 node_idx = 0;
1698 }
1699
1700 #ifdef DEBUG
1701 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1702 #endif
1703
1704 /* If we have already calculated, skip it. */
1705 if (dfa->eclosures[node_idx].nelem != 0)
1706 continue;
1707 /* Calculate epsilon closure of 'node_idx'. */
1708 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1709 if (BE (err != REG_NOERROR, 0))
1710 return err;
1711
1712 if (dfa->eclosures[node_idx].nelem == 0)
1713 {
1714 incomplete = true;
1715 re_node_set_free (&eclosure_elem);
1716 }
1717 }
1718 return REG_NOERROR;
1719 }
1720
1721 /* Calculate epsilon closure of NODE. */
1722
1723 static reg_errcode_t
1724 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1725 {
1726 reg_errcode_t err;
1727 Idx i;
1728 re_node_set eclosure;
1729 bool ok;
1730 bool incomplete = false;
1731 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1732 if (BE (err != REG_NOERROR, 0))
1733 return err;
1734
1735 /* This indicates that we are calculating this node now.
1736 We reference this value to avoid infinite loop. */
1737 dfa->eclosures[node].nelem = REG_MISSING;
1738
1739 /* If the current node has constraints, duplicate all nodes
1740 since they must inherit the constraints. */
1741 if (dfa->nodes[node].constraint
1742 && dfa->edests[node].nelem
1743 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1744 {
1745 err = duplicate_node_closure (dfa, node, node, node,
1746 dfa->nodes[node].constraint);
1747 if (BE (err != REG_NOERROR, 0))
1748 return err;
1749 }
1750
1751 /* Expand each epsilon destination nodes. */
1752 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1753 for (i = 0; i < dfa->edests[node].nelem; ++i)
1754 {
1755 re_node_set eclosure_elem;
1756 Idx edest = dfa->edests[node].elems[i];
1757 /* If calculating the epsilon closure of 'edest' is in progress,
1758 return intermediate result. */
1759 if (dfa->eclosures[edest].nelem == REG_MISSING)
1760 {
1761 incomplete = true;
1762 continue;
1763 }
1764 /* If we haven't calculated the epsilon closure of 'edest' yet,
1765 calculate now. Otherwise use calculated epsilon closure. */
1766 if (dfa->eclosures[edest].nelem == 0)
1767 {
1768 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1769 if (BE (err != REG_NOERROR, 0))
1770 return err;
1771 }
1772 else
1773 eclosure_elem = dfa->eclosures[edest];
1774 /* Merge the epsilon closure of 'edest'. */
1775 err = re_node_set_merge (&eclosure, &eclosure_elem);
1776 if (BE (err != REG_NOERROR, 0))
1777 return err;
1778 /* If the epsilon closure of 'edest' is incomplete,
1779 the epsilon closure of this node is also incomplete. */
1780 if (dfa->eclosures[edest].nelem == 0)
1781 {
1782 incomplete = true;
1783 re_node_set_free (&eclosure_elem);
1784 }
1785 }
1786
1787 /* An epsilon closure includes itself. */
1788 ok = re_node_set_insert (&eclosure, node);
1789 if (BE (! ok, 0))
1790 return REG_ESPACE;
1791 if (incomplete && !root)
1792 dfa->eclosures[node].nelem = 0;
1793 else
1794 dfa->eclosures[node] = eclosure;
1795 *new_set = eclosure;
1796 return REG_NOERROR;
1797 }
1798 \f
1799 /* Functions for token which are used in the parser. */
1800
1801 /* Fetch a token from INPUT.
1802 We must not use this function inside bracket expressions. */
1803
1804 static void
1805 internal_function
1806 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1807 {
1808 re_string_skip_bytes (input, peek_token (result, input, syntax));
1809 }
1810
1811 /* Peek a token from INPUT, and return the length of the token.
1812 We must not use this function inside bracket expressions. */
1813
1814 static int
1815 internal_function
1816 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1817 {
1818 unsigned char c;
1819
1820 if (re_string_eoi (input))
1821 {
1822 token->type = END_OF_RE;
1823 return 0;
1824 }
1825
1826 c = re_string_peek_byte (input, 0);
1827 token->opr.c = c;
1828
1829 token->word_char = 0;
1830 #ifdef RE_ENABLE_I18N
1831 token->mb_partial = 0;
1832 if (input->mb_cur_max > 1 &&
1833 !re_string_first_byte (input, re_string_cur_idx (input)))
1834 {
1835 token->type = CHARACTER;
1836 token->mb_partial = 1;
1837 return 1;
1838 }
1839 #endif
1840 if (c == '\\')
1841 {
1842 unsigned char c2;
1843 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1844 {
1845 token->type = BACK_SLASH;
1846 return 1;
1847 }
1848
1849 c2 = re_string_peek_byte_case (input, 1);
1850 token->opr.c = c2;
1851 token->type = CHARACTER;
1852 #ifdef RE_ENABLE_I18N
1853 if (input->mb_cur_max > 1)
1854 {
1855 wint_t wc = re_string_wchar_at (input,
1856 re_string_cur_idx (input) + 1);
1857 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1858 }
1859 else
1860 #endif
1861 token->word_char = IS_WORD_CHAR (c2) != 0;
1862
1863 switch (c2)
1864 {
1865 case '|':
1866 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1867 token->type = OP_ALT;
1868 break;
1869 case '1': case '2': case '3': case '4': case '5':
1870 case '6': case '7': case '8': case '9':
1871 if (!(syntax & RE_NO_BK_REFS))
1872 {
1873 token->type = OP_BACK_REF;
1874 token->opr.idx = c2 - '1';
1875 }
1876 break;
1877 case '<':
1878 if (!(syntax & RE_NO_GNU_OPS))
1879 {
1880 token->type = ANCHOR;
1881 token->opr.ctx_type = WORD_FIRST;
1882 }
1883 break;
1884 case '>':
1885 if (!(syntax & RE_NO_GNU_OPS))
1886 {
1887 token->type = ANCHOR;
1888 token->opr.ctx_type = WORD_LAST;
1889 }
1890 break;
1891 case 'b':
1892 if (!(syntax & RE_NO_GNU_OPS))
1893 {
1894 token->type = ANCHOR;
1895 token->opr.ctx_type = WORD_DELIM;
1896 }
1897 break;
1898 case 'B':
1899 if (!(syntax & RE_NO_GNU_OPS))
1900 {
1901 token->type = ANCHOR;
1902 token->opr.ctx_type = NOT_WORD_DELIM;
1903 }
1904 break;
1905 case 'w':
1906 if (!(syntax & RE_NO_GNU_OPS))
1907 token->type = OP_WORD;
1908 break;
1909 case 'W':
1910 if (!(syntax & RE_NO_GNU_OPS))
1911 token->type = OP_NOTWORD;
1912 break;
1913 case 's':
1914 if (!(syntax & RE_NO_GNU_OPS))
1915 token->type = OP_SPACE;
1916 break;
1917 case 'S':
1918 if (!(syntax & RE_NO_GNU_OPS))
1919 token->type = OP_NOTSPACE;
1920 break;
1921 case '`':
1922 if (!(syntax & RE_NO_GNU_OPS))
1923 {
1924 token->type = ANCHOR;
1925 token->opr.ctx_type = BUF_FIRST;
1926 }
1927 break;
1928 case '\'':
1929 if (!(syntax & RE_NO_GNU_OPS))
1930 {
1931 token->type = ANCHOR;
1932 token->opr.ctx_type = BUF_LAST;
1933 }
1934 break;
1935 case '(':
1936 if (!(syntax & RE_NO_BK_PARENS))
1937 token->type = OP_OPEN_SUBEXP;
1938 break;
1939 case ')':
1940 if (!(syntax & RE_NO_BK_PARENS))
1941 token->type = OP_CLOSE_SUBEXP;
1942 break;
1943 case '+':
1944 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1945 token->type = OP_DUP_PLUS;
1946 break;
1947 case '?':
1948 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1949 token->type = OP_DUP_QUESTION;
1950 break;
1951 case '{':
1952 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1953 token->type = OP_OPEN_DUP_NUM;
1954 break;
1955 case '}':
1956 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1957 token->type = OP_CLOSE_DUP_NUM;
1958 break;
1959 default:
1960 break;
1961 }
1962 return 2;
1963 }
1964
1965 token->type = CHARACTER;
1966 #ifdef RE_ENABLE_I18N
1967 if (input->mb_cur_max > 1)
1968 {
1969 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1970 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1971 }
1972 else
1973 #endif
1974 token->word_char = IS_WORD_CHAR (token->opr.c);
1975
1976 switch (c)
1977 {
1978 case '\n':
1979 if (syntax & RE_NEWLINE_ALT)
1980 token->type = OP_ALT;
1981 break;
1982 case '|':
1983 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1984 token->type = OP_ALT;
1985 break;
1986 case '*':
1987 token->type = OP_DUP_ASTERISK;
1988 break;
1989 case '+':
1990 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1991 token->type = OP_DUP_PLUS;
1992 break;
1993 case '?':
1994 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1995 token->type = OP_DUP_QUESTION;
1996 break;
1997 case '{':
1998 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1999 token->type = OP_OPEN_DUP_NUM;
2000 break;
2001 case '}':
2002 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
2003 token->type = OP_CLOSE_DUP_NUM;
2004 break;
2005 case '(':
2006 if (syntax & RE_NO_BK_PARENS)
2007 token->type = OP_OPEN_SUBEXP;
2008 break;
2009 case ')':
2010 if (syntax & RE_NO_BK_PARENS)
2011 token->type = OP_CLOSE_SUBEXP;
2012 break;
2013 case '[':
2014 token->type = OP_OPEN_BRACKET;
2015 break;
2016 case '.':
2017 token->type = OP_PERIOD;
2018 break;
2019 case '^':
2020 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2021 re_string_cur_idx (input) != 0)
2022 {
2023 char prev = re_string_peek_byte (input, -1);
2024 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2025 break;
2026 }
2027 token->type = ANCHOR;
2028 token->opr.ctx_type = LINE_FIRST;
2029 break;
2030 case '$':
2031 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2032 re_string_cur_idx (input) + 1 != re_string_length (input))
2033 {
2034 re_token_t next;
2035 re_string_skip_bytes (input, 1);
2036 peek_token (&next, input, syntax);
2037 re_string_skip_bytes (input, -1);
2038 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2039 break;
2040 }
2041 token->type = ANCHOR;
2042 token->opr.ctx_type = LINE_LAST;
2043 break;
2044 default:
2045 break;
2046 }
2047 return 1;
2048 }
2049
2050 /* Peek a token from INPUT, and return the length of the token.
2051 We must not use this function out of bracket expressions. */
2052
2053 static int
2054 internal_function
2055 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2056 {
2057 unsigned char c;
2058 if (re_string_eoi (input))
2059 {
2060 token->type = END_OF_RE;
2061 return 0;
2062 }
2063 c = re_string_peek_byte (input, 0);
2064 token->opr.c = c;
2065
2066 #ifdef RE_ENABLE_I18N
2067 if (input->mb_cur_max > 1 &&
2068 !re_string_first_byte (input, re_string_cur_idx (input)))
2069 {
2070 token->type = CHARACTER;
2071 return 1;
2072 }
2073 #endif /* RE_ENABLE_I18N */
2074
2075 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2076 && re_string_cur_idx (input) + 1 < re_string_length (input))
2077 {
2078 /* In this case, '\' escape a character. */
2079 unsigned char c2;
2080 re_string_skip_bytes (input, 1);
2081 c2 = re_string_peek_byte (input, 0);
2082 token->opr.c = c2;
2083 token->type = CHARACTER;
2084 return 1;
2085 }
2086 if (c == '[') /* '[' is a special char in a bracket exps. */
2087 {
2088 unsigned char c2;
2089 int token_len;
2090 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2091 c2 = re_string_peek_byte (input, 1);
2092 else
2093 c2 = 0;
2094 token->opr.c = c2;
2095 token_len = 2;
2096 switch (c2)
2097 {
2098 case '.':
2099 token->type = OP_OPEN_COLL_ELEM;
2100 break;
2101 case '=':
2102 token->type = OP_OPEN_EQUIV_CLASS;
2103 break;
2104 case ':':
2105 if (syntax & RE_CHAR_CLASSES)
2106 {
2107 token->type = OP_OPEN_CHAR_CLASS;
2108 break;
2109 }
2110 /* else fall through. */
2111 default:
2112 token->type = CHARACTER;
2113 token->opr.c = c;
2114 token_len = 1;
2115 break;
2116 }
2117 return token_len;
2118 }
2119 switch (c)
2120 {
2121 case '-':
2122 token->type = OP_CHARSET_RANGE;
2123 break;
2124 case ']':
2125 token->type = OP_CLOSE_BRACKET;
2126 break;
2127 case '^':
2128 token->type = OP_NON_MATCH_LIST;
2129 break;
2130 default:
2131 token->type = CHARACTER;
2132 }
2133 return 1;
2134 }
2135 \f
2136 /* Functions for parser. */
2137
2138 /* Entry point of the parser.
2139 Parse the regular expression REGEXP and return the structure tree.
2140 If an error occurs, ERR is set by error code, and return NULL.
2141 This function build the following tree, from regular expression <reg_exp>:
2142 CAT
2143 / \
2144 / \
2145 <reg_exp> EOR
2146
2147 CAT means concatenation.
2148 EOR means end of regular expression. */
2149
2150 static bin_tree_t *
2151 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2152 reg_errcode_t *err)
2153 {
2154 re_dfa_t *dfa = preg->buffer;
2155 bin_tree_t *tree, *eor, *root;
2156 re_token_t current_token;
2157 dfa->syntax = syntax;
2158 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2159 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2160 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2161 return NULL;
2162 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2163 if (tree != NULL)
2164 root = create_tree (dfa, tree, eor, CONCAT);
2165 else
2166 root = eor;
2167 if (BE (eor == NULL || root == NULL, 0))
2168 {
2169 *err = REG_ESPACE;
2170 return NULL;
2171 }
2172 return root;
2173 }
2174
2175 /* This function build the following tree, from regular expression
2176 <branch1>|<branch2>:
2177 ALT
2178 / \
2179 / \
2180 <branch1> <branch2>
2181
2182 ALT means alternative, which represents the operator '|'. */
2183
2184 static bin_tree_t *
2185 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2186 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2187 {
2188 re_dfa_t *dfa = preg->buffer;
2189 bin_tree_t *tree, *branch = NULL;
2190 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2191 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2192 return NULL;
2193
2194 while (token->type == OP_ALT)
2195 {
2196 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2197 if (token->type != OP_ALT && token->type != END_OF_RE
2198 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2199 {
2200 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2201 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2202 return NULL;
2203 }
2204 else
2205 branch = NULL;
2206 tree = create_tree (dfa, tree, branch, OP_ALT);
2207 if (BE (tree == NULL, 0))
2208 {
2209 *err = REG_ESPACE;
2210 return NULL;
2211 }
2212 }
2213 return tree;
2214 }
2215
2216 /* This function build the following tree, from regular expression
2217 <exp1><exp2>:
2218 CAT
2219 / \
2220 / \
2221 <exp1> <exp2>
2222
2223 CAT means concatenation. */
2224
2225 static bin_tree_t *
2226 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2227 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2228 {
2229 bin_tree_t *tree, *expr;
2230 re_dfa_t *dfa = preg->buffer;
2231 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2232 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2233 return NULL;
2234
2235 while (token->type != OP_ALT && token->type != END_OF_RE
2236 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2237 {
2238 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2239 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2240 {
2241 if (tree != NULL)
2242 postorder (tree, free_tree, NULL);
2243 return NULL;
2244 }
2245 if (tree != NULL && expr != NULL)
2246 {
2247 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2248 if (newtree == NULL)
2249 {
2250 postorder (expr, free_tree, NULL);
2251 postorder (tree, free_tree, NULL);
2252 *err = REG_ESPACE;
2253 return NULL;
2254 }
2255 tree = newtree;
2256 }
2257 else if (tree == NULL)
2258 tree = expr;
2259 /* Otherwise expr == NULL, we don't need to create new tree. */
2260 }
2261 return tree;
2262 }
2263
2264 /* This function build the following tree, from regular expression a*:
2265 *
2266 |
2267 a
2268 */
2269
2270 static bin_tree_t *
2271 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2272 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2273 {
2274 re_dfa_t *dfa = preg->buffer;
2275 bin_tree_t *tree;
2276 switch (token->type)
2277 {
2278 case CHARACTER:
2279 tree = create_token_tree (dfa, NULL, NULL, token);
2280 if (BE (tree == NULL, 0))
2281 {
2282 *err = REG_ESPACE;
2283 return NULL;
2284 }
2285 #ifdef RE_ENABLE_I18N
2286 if (dfa->mb_cur_max > 1)
2287 {
2288 while (!re_string_eoi (regexp)
2289 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2290 {
2291 bin_tree_t *mbc_remain;
2292 fetch_token (token, regexp, syntax);
2293 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2294 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2295 if (BE (mbc_remain == NULL || tree == NULL, 0))
2296 {
2297 *err = REG_ESPACE;
2298 return NULL;
2299 }
2300 }
2301 }
2302 #endif
2303 break;
2304 case OP_OPEN_SUBEXP:
2305 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2306 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2307 return NULL;
2308 break;
2309 case OP_OPEN_BRACKET:
2310 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2311 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2312 return NULL;
2313 break;
2314 case OP_BACK_REF:
2315 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2316 {
2317 *err = REG_ESUBREG;
2318 return NULL;
2319 }
2320 dfa->used_bkref_map |= 1 << token->opr.idx;
2321 tree = create_token_tree (dfa, NULL, NULL, token);
2322 if (BE (tree == NULL, 0))
2323 {
2324 *err = REG_ESPACE;
2325 return NULL;
2326 }
2327 ++dfa->nbackref;
2328 dfa->has_mb_node = 1;
2329 break;
2330 case OP_OPEN_DUP_NUM:
2331 if (syntax & RE_CONTEXT_INVALID_DUP)
2332 {
2333 *err = REG_BADRPT;
2334 return NULL;
2335 }
2336 /* FALLTHROUGH */
2337 case OP_DUP_ASTERISK:
2338 case OP_DUP_PLUS:
2339 case OP_DUP_QUESTION:
2340 if (syntax & RE_CONTEXT_INVALID_OPS)
2341 {
2342 *err = REG_BADRPT;
2343 return NULL;
2344 }
2345 else if (syntax & RE_CONTEXT_INDEP_OPS)
2346 {
2347 fetch_token (token, regexp, syntax);
2348 return parse_expression (regexp, preg, token, syntax, nest, err);
2349 }
2350 /* else fall through */
2351 case OP_CLOSE_SUBEXP:
2352 if ((token->type == OP_CLOSE_SUBEXP) &&
2353 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2354 {
2355 *err = REG_ERPAREN;
2356 return NULL;
2357 }
2358 /* else fall through */
2359 case OP_CLOSE_DUP_NUM:
2360 /* We treat it as a normal character. */
2361
2362 /* Then we can these characters as normal characters. */
2363 token->type = CHARACTER;
2364 /* mb_partial and word_char bits should be initialized already
2365 by peek_token. */
2366 tree = create_token_tree (dfa, NULL, NULL, token);
2367 if (BE (tree == NULL, 0))
2368 {
2369 *err = REG_ESPACE;
2370 return NULL;
2371 }
2372 break;
2373 case ANCHOR:
2374 if ((token->opr.ctx_type
2375 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2376 && dfa->word_ops_used == 0)
2377 init_word_char (dfa);
2378 if (token->opr.ctx_type == WORD_DELIM
2379 || token->opr.ctx_type == NOT_WORD_DELIM)
2380 {
2381 bin_tree_t *tree_first, *tree_last;
2382 if (token->opr.ctx_type == WORD_DELIM)
2383 {
2384 token->opr.ctx_type = WORD_FIRST;
2385 tree_first = create_token_tree (dfa, NULL, NULL, token);
2386 token->opr.ctx_type = WORD_LAST;
2387 }
2388 else
2389 {
2390 token->opr.ctx_type = INSIDE_WORD;
2391 tree_first = create_token_tree (dfa, NULL, NULL, token);
2392 token->opr.ctx_type = INSIDE_NOTWORD;
2393 }
2394 tree_last = create_token_tree (dfa, NULL, NULL, token);
2395 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2396 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2397 {
2398 *err = REG_ESPACE;
2399 return NULL;
2400 }
2401 }
2402 else
2403 {
2404 tree = create_token_tree (dfa, NULL, NULL, token);
2405 if (BE (tree == NULL, 0))
2406 {
2407 *err = REG_ESPACE;
2408 return NULL;
2409 }
2410 }
2411 /* We must return here, since ANCHORs can't be followed
2412 by repetition operators.
2413 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2414 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2415 fetch_token (token, regexp, syntax);
2416 return tree;
2417 case OP_PERIOD:
2418 tree = create_token_tree (dfa, NULL, NULL, token);
2419 if (BE (tree == NULL, 0))
2420 {
2421 *err = REG_ESPACE;
2422 return NULL;
2423 }
2424 if (dfa->mb_cur_max > 1)
2425 dfa->has_mb_node = 1;
2426 break;
2427 case OP_WORD:
2428 case OP_NOTWORD:
2429 tree = build_charclass_op (dfa, regexp->trans,
2430 "alnum",
2431 "_",
2432 token->type == OP_NOTWORD, err);
2433 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2434 return NULL;
2435 break;
2436 case OP_SPACE:
2437 case OP_NOTSPACE:
2438 tree = build_charclass_op (dfa, regexp->trans,
2439 "space",
2440 "",
2441 token->type == OP_NOTSPACE, err);
2442 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2443 return NULL;
2444 break;
2445 case OP_ALT:
2446 case END_OF_RE:
2447 return NULL;
2448 case BACK_SLASH:
2449 *err = REG_EESCAPE;
2450 return NULL;
2451 default:
2452 /* Must not happen? */
2453 #ifdef DEBUG
2454 assert (0);
2455 #endif
2456 return NULL;
2457 }
2458 fetch_token (token, regexp, syntax);
2459
2460 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2461 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2462 {
2463 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2464 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2465 return NULL;
2466 /* In BRE consecutive duplications are not allowed. */
2467 if ((syntax & RE_CONTEXT_INVALID_DUP)
2468 && (token->type == OP_DUP_ASTERISK
2469 || token->type == OP_OPEN_DUP_NUM))
2470 {
2471 *err = REG_BADRPT;
2472 return NULL;
2473 }
2474 }
2475
2476 return tree;
2477 }
2478
2479 /* This function build the following tree, from regular expression
2480 (<reg_exp>):
2481 SUBEXP
2482 |
2483 <reg_exp>
2484 */
2485
2486 static bin_tree_t *
2487 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2488 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2489 {
2490 re_dfa_t *dfa = preg->buffer;
2491 bin_tree_t *tree;
2492 size_t cur_nsub;
2493 cur_nsub = preg->re_nsub++;
2494
2495 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2496
2497 /* The subexpression may be a null string. */
2498 if (token->type == OP_CLOSE_SUBEXP)
2499 tree = NULL;
2500 else
2501 {
2502 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2503 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2504 {
2505 if (tree != NULL)
2506 postorder (tree, free_tree, NULL);
2507 *err = REG_EPAREN;
2508 }
2509 if (BE (*err != REG_NOERROR, 0))
2510 return NULL;
2511 }
2512
2513 if (cur_nsub <= '9' - '1')
2514 dfa->completed_bkref_map |= 1 << cur_nsub;
2515
2516 tree = create_tree (dfa, tree, NULL, SUBEXP);
2517 if (BE (tree == NULL, 0))
2518 {
2519 *err = REG_ESPACE;
2520 return NULL;
2521 }
2522 tree->token.opr.idx = cur_nsub;
2523 return tree;
2524 }
2525
2526 /* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2527
2528 static bin_tree_t *
2529 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2530 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2531 {
2532 bin_tree_t *tree = NULL, *old_tree = NULL;
2533 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2534 re_token_t start_token = *token;
2535
2536 if (token->type == OP_OPEN_DUP_NUM)
2537 {
2538 end = 0;
2539 start = fetch_number (regexp, token, syntax);
2540 if (start == REG_MISSING)
2541 {
2542 if (token->type == CHARACTER && token->opr.c == ',')
2543 start = 0; /* We treat "{,m}" as "{0,m}". */
2544 else
2545 {
2546 *err = REG_BADBR; /* <re>{} is invalid. */
2547 return NULL;
2548 }
2549 }
2550 if (BE (start != REG_ERROR, 1))
2551 {
2552 /* We treat "{n}" as "{n,n}". */
2553 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2554 : ((token->type == CHARACTER && token->opr.c == ',')
2555 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2556 }
2557 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2558 {
2559 /* Invalid sequence. */
2560 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2561 {
2562 if (token->type == END_OF_RE)
2563 *err = REG_EBRACE;
2564 else
2565 *err = REG_BADBR;
2566
2567 return NULL;
2568 }
2569
2570 /* If the syntax bit is set, rollback. */
2571 re_string_set_index (regexp, start_idx);
2572 *token = start_token;
2573 token->type = CHARACTER;
2574 /* mb_partial and word_char bits should be already initialized by
2575 peek_token. */
2576 return elem;
2577 }
2578
2579 if (BE ((end != REG_MISSING && start > end)
2580 || token->type != OP_CLOSE_DUP_NUM, 0))
2581 {
2582 /* First number greater than second. */
2583 *err = REG_BADBR;
2584 return NULL;
2585 }
2586
2587 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2588 {
2589 *err = REG_ESIZE;
2590 return NULL;
2591 }
2592 }
2593 else
2594 {
2595 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2596 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2597 }
2598
2599 fetch_token (token, regexp, syntax);
2600
2601 if (BE (elem == NULL, 0))
2602 return NULL;
2603 if (BE (start == 0 && end == 0, 0))
2604 {
2605 postorder (elem, free_tree, NULL);
2606 return NULL;
2607 }
2608
2609 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2610 if (BE (start > 0, 0))
2611 {
2612 tree = elem;
2613 for (i = 2; i <= start; ++i)
2614 {
2615 elem = duplicate_tree (elem, dfa);
2616 tree = create_tree (dfa, tree, elem, CONCAT);
2617 if (BE (elem == NULL || tree == NULL, 0))
2618 goto parse_dup_op_espace;
2619 }
2620
2621 if (start == end)
2622 return tree;
2623
2624 /* Duplicate ELEM before it is marked optional. */
2625 elem = duplicate_tree (elem, dfa);
2626 old_tree = tree;
2627 }
2628 else
2629 old_tree = NULL;
2630
2631 if (elem->token.type == SUBEXP)
2632 {
2633 uintptr_t subidx = elem->token.opr.idx;
2634 postorder (elem, mark_opt_subexp, (void *) subidx);
2635 }
2636
2637 tree = create_tree (dfa, elem, NULL,
2638 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2639 if (BE (tree == NULL, 0))
2640 goto parse_dup_op_espace;
2641
2642 /* From gnulib's "intprops.h":
2643 True if the arithmetic type T is signed. */
2644 #define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2645
2646 /* This loop is actually executed only when end != REG_MISSING,
2647 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2648 already created the start+1-th copy. */
2649 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2650 for (i = start + 2; i <= end; ++i)
2651 {
2652 elem = duplicate_tree (elem, dfa);
2653 tree = create_tree (dfa, tree, elem, CONCAT);
2654 if (BE (elem == NULL || tree == NULL, 0))
2655 goto parse_dup_op_espace;
2656
2657 tree = create_tree (dfa, tree, NULL, OP_ALT);
2658 if (BE (tree == NULL, 0))
2659 goto parse_dup_op_espace;
2660 }
2661
2662 if (old_tree)
2663 tree = create_tree (dfa, old_tree, tree, CONCAT);
2664
2665 return tree;
2666
2667 parse_dup_op_espace:
2668 *err = REG_ESPACE;
2669 return NULL;
2670 }
2671
2672 /* Size of the names for collating symbol/equivalence_class/character_class.
2673 I'm not sure, but maybe enough. */
2674 #define BRACKET_NAME_BUF_SIZE 32
2675
2676 #ifndef _LIBC
2677 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2678 Build the range expression which starts from START_ELEM, and ends
2679 at END_ELEM. The result are written to MBCSET and SBCSET.
2680 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2681 mbcset->range_ends, is a pointer argument since we may
2682 update it. */
2683
2684 static reg_errcode_t
2685 internal_function
2686 # ifdef RE_ENABLE_I18N
2687 build_range_exp (const reg_syntax_t syntax,
2688 bitset_t sbcset,
2689 re_charset_t *mbcset,
2690 Idx *range_alloc,
2691 const bracket_elem_t *start_elem,
2692 const bracket_elem_t *end_elem)
2693 # else /* not RE_ENABLE_I18N */
2694 build_range_exp (const reg_syntax_t syntax,
2695 bitset_t sbcset,
2696 const bracket_elem_t *start_elem,
2697 const bracket_elem_t *end_elem)
2698 # endif /* not RE_ENABLE_I18N */
2699 {
2700 unsigned int start_ch, end_ch;
2701 /* Equivalence Classes and Character Classes can't be a range start/end. */
2702 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2703 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2704 0))
2705 return REG_ERANGE;
2706
2707 /* We can handle no multi character collating elements without libc
2708 support. */
2709 if (BE ((start_elem->type == COLL_SYM
2710 && strlen ((char *) start_elem->opr.name) > 1)
2711 || (end_elem->type == COLL_SYM
2712 && strlen ((char *) end_elem->opr.name) > 1), 0))
2713 return REG_ECOLLATE;
2714
2715 # ifdef RE_ENABLE_I18N
2716 {
2717 wchar_t wc;
2718 wint_t start_wc;
2719 wint_t end_wc;
2720
2721 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2722 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2723 : 0));
2724 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2725 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2726 : 0));
2727 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2728 ? __btowc (start_ch) : start_elem->opr.wch);
2729 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2730 ? __btowc (end_ch) : end_elem->opr.wch);
2731 if (start_wc == WEOF || end_wc == WEOF)
2732 return REG_ECOLLATE;
2733 else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
2734 return REG_ERANGE;
2735
2736 /* Got valid collation sequence values, add them as a new entry.
2737 However, for !_LIBC we have no collation elements: if the
2738 character set is single byte, the single byte character set
2739 that we build below suffices. parse_bracket_exp passes
2740 no MBCSET if dfa->mb_cur_max == 1. */
2741 if (mbcset)
2742 {
2743 /* Check the space of the arrays. */
2744 if (BE (*range_alloc == mbcset->nranges, 0))
2745 {
2746 /* There is not enough space, need realloc. */
2747 wchar_t *new_array_start, *new_array_end;
2748 Idx new_nranges;
2749
2750 /* +1 in case of mbcset->nranges is 0. */
2751 new_nranges = 2 * mbcset->nranges + 1;
2752 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2753 are NULL if *range_alloc == 0. */
2754 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2755 new_nranges);
2756 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2757 new_nranges);
2758
2759 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2760 return REG_ESPACE;
2761
2762 mbcset->range_starts = new_array_start;
2763 mbcset->range_ends = new_array_end;
2764 *range_alloc = new_nranges;
2765 }
2766
2767 mbcset->range_starts[mbcset->nranges] = start_wc;
2768 mbcset->range_ends[mbcset->nranges++] = end_wc;
2769 }
2770
2771 /* Build the table for single byte characters. */
2772 for (wc = 0; wc < SBC_MAX; ++wc)
2773 {
2774 if (start_wc <= wc && wc <= end_wc)
2775 bitset_set (sbcset, wc);
2776 }
2777 }
2778 # else /* not RE_ENABLE_I18N */
2779 {
2780 unsigned int ch;
2781 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2782 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2783 : 0));
2784 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2785 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2786 : 0));
2787 if (start_ch > end_ch)
2788 return REG_ERANGE;
2789 /* Build the table for single byte characters. */
2790 for (ch = 0; ch < SBC_MAX; ++ch)
2791 if (start_ch <= ch && ch <= end_ch)
2792 bitset_set (sbcset, ch);
2793 }
2794 # endif /* not RE_ENABLE_I18N */
2795 return REG_NOERROR;
2796 }
2797 #endif /* not _LIBC */
2798
2799 #ifndef _LIBC
2800 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2801 Build the collating element which is represented by NAME.
2802 The result are written to MBCSET and SBCSET.
2803 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2804 pointer argument since we may update it. */
2805
2806 static reg_errcode_t
2807 internal_function
2808 # ifdef RE_ENABLE_I18N
2809 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2810 Idx *coll_sym_alloc, const unsigned char *name)
2811 # else /* not RE_ENABLE_I18N */
2812 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2813 # endif /* not RE_ENABLE_I18N */
2814 {
2815 size_t name_len = strlen ((const char *) name);
2816 if (BE (name_len != 1, 0))
2817 return REG_ECOLLATE;
2818 else
2819 {
2820 bitset_set (sbcset, name[0]);
2821 return REG_NOERROR;
2822 }
2823 }
2824 #endif /* not _LIBC */
2825
2826 /* This function parse bracket expression like "[abc]", "[a-c]",
2827 "[[.a-a.]]" etc. */
2828
2829 static bin_tree_t *
2830 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2831 reg_syntax_t syntax, reg_errcode_t *err)
2832 {
2833 #ifdef _LIBC
2834 const unsigned char *collseqmb;
2835 const char *collseqwc;
2836 uint32_t nrules;
2837 int32_t table_size;
2838 const int32_t *symb_table;
2839 const unsigned char *extra;
2840
2841 /* Local function for parse_bracket_exp used in _LIBC environment.
2842 Seek the collating symbol entry corresponding to NAME.
2843 Return the index of the symbol in the SYMB_TABLE,
2844 or -1 if not found. */
2845
2846 auto inline int32_t
2847 __attribute__ ((always_inline))
2848 seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2849 {
2850 int32_t elem;
2851
2852 for (elem = 0; elem < table_size; elem++)
2853 if (symb_table[2 * elem] != 0)
2854 {
2855 int32_t idx = symb_table[2 * elem + 1];
2856 /* Skip the name of collating element name. */
2857 idx += 1 + extra[idx];
2858 if (/* Compare the length of the name. */
2859 name_len == extra[idx]
2860 /* Compare the name. */
2861 && memcmp (name, &extra[idx + 1], name_len) == 0)
2862 /* Yep, this is the entry. */
2863 return elem;
2864 }
2865 return -1;
2866 }
2867
2868 /* Local function for parse_bracket_exp used in _LIBC environment.
2869 Look up the collation sequence value of BR_ELEM.
2870 Return the value if succeeded, UINT_MAX otherwise. */
2871
2872 auto inline unsigned int
2873 __attribute__ ((always_inline))
2874 lookup_collation_sequence_value (bracket_elem_t *br_elem)
2875 {
2876 if (br_elem->type == SB_CHAR)
2877 {
2878 /*
2879 if (MB_CUR_MAX == 1)
2880 */
2881 if (nrules == 0)
2882 return collseqmb[br_elem->opr.ch];
2883 else
2884 {
2885 wint_t wc = __btowc (br_elem->opr.ch);
2886 return __collseq_table_lookup (collseqwc, wc);
2887 }
2888 }
2889 else if (br_elem->type == MB_CHAR)
2890 {
2891 if (nrules != 0)
2892 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2893 }
2894 else if (br_elem->type == COLL_SYM)
2895 {
2896 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2897 if (nrules != 0)
2898 {
2899 int32_t elem, idx;
2900 elem = seek_collating_symbol_entry (br_elem->opr.name,
2901 sym_name_len);
2902 if (elem != -1)
2903 {
2904 /* We found the entry. */
2905 idx = symb_table[2 * elem + 1];
2906 /* Skip the name of collating element name. */
2907 idx += 1 + extra[idx];
2908 /* Skip the byte sequence of the collating element. */
2909 idx += 1 + extra[idx];
2910 /* Adjust for the alignment. */
2911 idx = (idx + 3) & ~3;
2912 /* Skip the multibyte collation sequence value. */
2913 idx += sizeof (unsigned int);
2914 /* Skip the wide char sequence of the collating element. */
2915 idx += sizeof (unsigned int) *
2916 (1 + *(unsigned int *) (extra + idx));
2917 /* Return the collation sequence value. */
2918 return *(unsigned int *) (extra + idx);
2919 }
2920 else if (sym_name_len == 1)
2921 {
2922 /* No valid character. Match it as a single byte
2923 character. */
2924 return collseqmb[br_elem->opr.name[0]];
2925 }
2926 }
2927 else if (sym_name_len == 1)
2928 return collseqmb[br_elem->opr.name[0]];
2929 }
2930 return UINT_MAX;
2931 }
2932
2933 /* Local function for parse_bracket_exp used in _LIBC environment.
2934 Build the range expression which starts from START_ELEM, and ends
2935 at END_ELEM. The result are written to MBCSET and SBCSET.
2936 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2937 mbcset->range_ends, is a pointer argument since we may
2938 update it. */
2939
2940 auto inline reg_errcode_t
2941 __attribute__ ((always_inline))
2942 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2943 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2944 {
2945 unsigned int ch;
2946 uint32_t start_collseq;
2947 uint32_t end_collseq;
2948
2949 /* Equivalence Classes and Character Classes can't be a range
2950 start/end. */
2951 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2952 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2953 0))
2954 return REG_ERANGE;
2955
2956 /* FIXME: Implement rational ranges here, too. */
2957 start_collseq = lookup_collation_sequence_value (start_elem);
2958 end_collseq = lookup_collation_sequence_value (end_elem);
2959 /* Check start/end collation sequence values. */
2960 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2961 return REG_ECOLLATE;
2962 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2963 return REG_ERANGE;
2964
2965 /* Got valid collation sequence values, add them as a new entry.
2966 However, if we have no collation elements, and the character set
2967 is single byte, the single byte character set that we
2968 build below suffices. */
2969 if (nrules > 0 || dfa->mb_cur_max > 1)
2970 {
2971 /* Check the space of the arrays. */
2972 if (BE (*range_alloc == mbcset->nranges, 0))
2973 {
2974 /* There is not enough space, need realloc. */
2975 uint32_t *new_array_start;
2976 uint32_t *new_array_end;
2977 Idx new_nranges;
2978
2979 /* +1 in case of mbcset->nranges is 0. */
2980 new_nranges = 2 * mbcset->nranges + 1;
2981 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2982 new_nranges);
2983 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2984 new_nranges);
2985
2986 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2987 return REG_ESPACE;
2988
2989 mbcset->range_starts = new_array_start;
2990 mbcset->range_ends = new_array_end;
2991 *range_alloc = new_nranges;
2992 }
2993
2994 mbcset->range_starts[mbcset->nranges] = start_collseq;
2995 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2996 }
2997
2998 /* Build the table for single byte characters. */
2999 for (ch = 0; ch < SBC_MAX; ch++)
3000 {
3001 uint32_t ch_collseq;
3002 /*
3003 if (MB_CUR_MAX == 1)
3004 */
3005 if (nrules == 0)
3006 ch_collseq = collseqmb[ch];
3007 else
3008 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3009 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3010 bitset_set (sbcset, ch);
3011 }
3012 return REG_NOERROR;
3013 }
3014
3015 /* Local function for parse_bracket_exp used in _LIBC environment.
3016 Build the collating element which is represented by NAME.
3017 The result are written to MBCSET and SBCSET.
3018 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3019 pointer argument since we may update it. */
3020
3021 auto inline reg_errcode_t
3022 __attribute__ ((always_inline))
3023 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
3024 Idx *coll_sym_alloc, const unsigned char *name)
3025 {
3026 int32_t elem, idx;
3027 size_t name_len = strlen ((const char *) name);
3028 if (nrules != 0)
3029 {
3030 elem = seek_collating_symbol_entry (name, name_len);
3031 if (elem != -1)
3032 {
3033 /* We found the entry. */
3034 idx = symb_table[2 * elem + 1];
3035 /* Skip the name of collating element name. */
3036 idx += 1 + extra[idx];
3037 }
3038 else if (name_len == 1)
3039 {
3040 /* No valid character, treat it as a normal
3041 character. */
3042 bitset_set (sbcset, name[0]);
3043 return REG_NOERROR;
3044 }
3045 else
3046 return REG_ECOLLATE;
3047
3048 /* Got valid collation sequence, add it as a new entry. */
3049 /* Check the space of the arrays. */
3050 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3051 {
3052 /* Not enough, realloc it. */
3053 /* +1 in case of mbcset->ncoll_syms is 0. */
3054 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3055 /* Use realloc since mbcset->coll_syms is NULL
3056 if *alloc == 0. */
3057 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3058 new_coll_sym_alloc);
3059 if (BE (new_coll_syms == NULL, 0))
3060 return REG_ESPACE;
3061 mbcset->coll_syms = new_coll_syms;
3062 *coll_sym_alloc = new_coll_sym_alloc;
3063 }
3064 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3065 return REG_NOERROR;
3066 }
3067 else
3068 {
3069 if (BE (name_len != 1, 0))
3070 return REG_ECOLLATE;
3071 else
3072 {
3073 bitset_set (sbcset, name[0]);
3074 return REG_NOERROR;
3075 }
3076 }
3077 }
3078 #endif
3079
3080 re_token_t br_token;
3081 re_bitset_ptr_t sbcset;
3082 #ifdef RE_ENABLE_I18N
3083 re_charset_t *mbcset;
3084 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3085 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3086 #endif /* not RE_ENABLE_I18N */
3087 bool non_match = false;
3088 bin_tree_t *work_tree;
3089 int token_len;
3090 bool first_round = true;
3091 #ifdef _LIBC
3092 collseqmb = (const unsigned char *)
3093 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3094 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3095 if (nrules)
3096 {
3097 /*
3098 if (MB_CUR_MAX > 1)
3099 */
3100 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3101 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3102 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3103 _NL_COLLATE_SYMB_TABLEMB);
3104 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3105 _NL_COLLATE_SYMB_EXTRAMB);
3106 }
3107 #endif
3108 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3109 #ifdef RE_ENABLE_I18N
3110 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3111 #endif /* RE_ENABLE_I18N */
3112 #ifdef RE_ENABLE_I18N
3113 if (BE (sbcset == NULL || mbcset == NULL, 0))
3114 #else
3115 if (BE (sbcset == NULL, 0))
3116 #endif /* RE_ENABLE_I18N */
3117 {
3118 re_free (sbcset);
3119 #ifdef RE_ENABLE_I18N
3120 re_free (mbcset);
3121 #endif
3122 *err = REG_ESPACE;
3123 return NULL;
3124 }
3125
3126 token_len = peek_token_bracket (token, regexp, syntax);
3127 if (BE (token->type == END_OF_RE, 0))
3128 {
3129 *err = REG_BADPAT;
3130 goto parse_bracket_exp_free_return;
3131 }
3132 if (token->type == OP_NON_MATCH_LIST)
3133 {
3134 #ifdef RE_ENABLE_I18N
3135 mbcset->non_match = 1;
3136 #endif /* not RE_ENABLE_I18N */
3137 non_match = true;
3138 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3139 bitset_set (sbcset, '\n');
3140 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3141 token_len = peek_token_bracket (token, regexp, syntax);
3142 if (BE (token->type == END_OF_RE, 0))
3143 {
3144 *err = REG_BADPAT;
3145 goto parse_bracket_exp_free_return;
3146 }
3147 }
3148
3149 /* We treat the first ']' as a normal character. */
3150 if (token->type == OP_CLOSE_BRACKET)
3151 token->type = CHARACTER;
3152
3153 while (1)
3154 {
3155 bracket_elem_t start_elem, end_elem;
3156 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3157 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3158 reg_errcode_t ret;
3159 int token_len2 = 0;
3160 bool is_range_exp = false;
3161 re_token_t token2;
3162
3163 start_elem.opr.name = start_name_buf;
3164 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3165 syntax, first_round);
3166 if (BE (ret != REG_NOERROR, 0))
3167 {
3168 *err = ret;
3169 goto parse_bracket_exp_free_return;
3170 }
3171 first_round = false;
3172
3173 /* Get information about the next token. We need it in any case. */
3174 token_len = peek_token_bracket (token, regexp, syntax);
3175
3176 /* Do not check for ranges if we know they are not allowed. */
3177 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3178 {
3179 if (BE (token->type == END_OF_RE, 0))
3180 {
3181 *err = REG_EBRACK;
3182 goto parse_bracket_exp_free_return;
3183 }
3184 if (token->type == OP_CHARSET_RANGE)
3185 {
3186 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3187 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3188 if (BE (token2.type == END_OF_RE, 0))
3189 {
3190 *err = REG_EBRACK;
3191 goto parse_bracket_exp_free_return;
3192 }
3193 if (token2.type == OP_CLOSE_BRACKET)
3194 {
3195 /* We treat the last '-' as a normal character. */
3196 re_string_skip_bytes (regexp, -token_len);
3197 token->type = CHARACTER;
3198 }
3199 else
3200 is_range_exp = true;
3201 }
3202 }
3203
3204 if (is_range_exp == true)
3205 {
3206 end_elem.opr.name = end_name_buf;
3207 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3208 dfa, syntax, true);
3209 if (BE (ret != REG_NOERROR, 0))
3210 {
3211 *err = ret;
3212 goto parse_bracket_exp_free_return;
3213 }
3214
3215 token_len = peek_token_bracket (token, regexp, syntax);
3216
3217 #ifdef _LIBC
3218 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3219 &start_elem, &end_elem);
3220 #else
3221 # ifdef RE_ENABLE_I18N
3222 *err = build_range_exp (syntax, sbcset,
3223 dfa->mb_cur_max > 1 ? mbcset : NULL,
3224 &range_alloc, &start_elem, &end_elem);
3225 # else
3226 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3227 # endif
3228 #endif /* RE_ENABLE_I18N */
3229 if (BE (*err != REG_NOERROR, 0))
3230 goto parse_bracket_exp_free_return;
3231 }
3232 else
3233 {
3234 switch (start_elem.type)
3235 {
3236 case SB_CHAR:
3237 bitset_set (sbcset, start_elem.opr.ch);
3238 break;
3239 #ifdef RE_ENABLE_I18N
3240 case MB_CHAR:
3241 /* Check whether the array has enough space. */
3242 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3243 {
3244 wchar_t *new_mbchars;
3245 /* Not enough, realloc it. */
3246 /* +1 in case of mbcset->nmbchars is 0. */
3247 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3248 /* Use realloc since array is NULL if *alloc == 0. */
3249 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3250 mbchar_alloc);
3251 if (BE (new_mbchars == NULL, 0))
3252 goto parse_bracket_exp_espace;
3253 mbcset->mbchars = new_mbchars;
3254 }
3255 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3256 break;
3257 #endif /* RE_ENABLE_I18N */
3258 case EQUIV_CLASS:
3259 *err = build_equiv_class (sbcset,
3260 #ifdef RE_ENABLE_I18N
3261 mbcset, &equiv_class_alloc,
3262 #endif /* RE_ENABLE_I18N */
3263 start_elem.opr.name);
3264 if (BE (*err != REG_NOERROR, 0))
3265 goto parse_bracket_exp_free_return;
3266 break;
3267 case COLL_SYM:
3268 *err = build_collating_symbol (sbcset,
3269 #ifdef RE_ENABLE_I18N
3270 mbcset, &coll_sym_alloc,
3271 #endif /* RE_ENABLE_I18N */
3272 start_elem.opr.name);
3273 if (BE (*err != REG_NOERROR, 0))
3274 goto parse_bracket_exp_free_return;
3275 break;
3276 case CHAR_CLASS:
3277 *err = build_charclass (regexp->trans, sbcset,
3278 #ifdef RE_ENABLE_I18N
3279 mbcset, &char_class_alloc,
3280 #endif /* RE_ENABLE_I18N */
3281 (const char *) start_elem.opr.name,
3282 syntax);
3283 if (BE (*err != REG_NOERROR, 0))
3284 goto parse_bracket_exp_free_return;
3285 break;
3286 default:
3287 assert (0);
3288 break;
3289 }
3290 }
3291 if (BE (token->type == END_OF_RE, 0))
3292 {
3293 *err = REG_EBRACK;
3294 goto parse_bracket_exp_free_return;
3295 }
3296 if (token->type == OP_CLOSE_BRACKET)
3297 break;
3298 }
3299
3300 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3301
3302 /* If it is non-matching list. */
3303 if (non_match)
3304 bitset_not (sbcset);
3305
3306 #ifdef RE_ENABLE_I18N
3307 /* Ensure only single byte characters are set. */
3308 if (dfa->mb_cur_max > 1)
3309 bitset_mask (sbcset, dfa->sb_char);
3310
3311 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3312 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3313 || mbcset->non_match)))
3314 {
3315 bin_tree_t *mbc_tree;
3316 int sbc_idx;
3317 /* Build a tree for complex bracket. */
3318 dfa->has_mb_node = 1;
3319 br_token.type = COMPLEX_BRACKET;
3320 br_token.opr.mbcset = mbcset;
3321 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3322 if (BE (mbc_tree == NULL, 0))
3323 goto parse_bracket_exp_espace;
3324 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3325 if (sbcset[sbc_idx])
3326 break;
3327 /* If there are no bits set in sbcset, there is no point
3328 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3329 if (sbc_idx < BITSET_WORDS)
3330 {
3331 /* Build a tree for simple bracket. */
3332 br_token.type = SIMPLE_BRACKET;
3333 br_token.opr.sbcset = sbcset;
3334 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3335 if (BE (work_tree == NULL, 0))
3336 goto parse_bracket_exp_espace;
3337
3338 /* Then join them by ALT node. */
3339 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3340 if (BE (work_tree == NULL, 0))
3341 goto parse_bracket_exp_espace;
3342 }
3343 else
3344 {
3345 re_free (sbcset);
3346 work_tree = mbc_tree;
3347 }
3348 }
3349 else
3350 #endif /* not RE_ENABLE_I18N */
3351 {
3352 #ifdef RE_ENABLE_I18N
3353 free_charset (mbcset);
3354 #endif
3355 /* Build a tree for simple bracket. */
3356 br_token.type = SIMPLE_BRACKET;
3357 br_token.opr.sbcset = sbcset;
3358 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3359 if (BE (work_tree == NULL, 0))
3360 goto parse_bracket_exp_espace;
3361 }
3362 return work_tree;
3363
3364 parse_bracket_exp_espace:
3365 *err = REG_ESPACE;
3366 parse_bracket_exp_free_return:
3367 re_free (sbcset);
3368 #ifdef RE_ENABLE_I18N
3369 free_charset (mbcset);
3370 #endif /* RE_ENABLE_I18N */
3371 return NULL;
3372 }
3373
3374 /* Parse an element in the bracket expression. */
3375
3376 static reg_errcode_t
3377 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3378 re_token_t *token, int token_len, re_dfa_t *dfa,
3379 reg_syntax_t syntax, bool accept_hyphen)
3380 {
3381 #ifdef RE_ENABLE_I18N
3382 int cur_char_size;
3383 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3384 if (cur_char_size > 1)
3385 {
3386 elem->type = MB_CHAR;
3387 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3388 re_string_skip_bytes (regexp, cur_char_size);
3389 return REG_NOERROR;
3390 }
3391 #endif /* RE_ENABLE_I18N */
3392 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3393 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3394 || token->type == OP_OPEN_EQUIV_CLASS)
3395 return parse_bracket_symbol (elem, regexp, token);
3396 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3397 {
3398 /* A '-' must only appear as anything but a range indicator before
3399 the closing bracket. Everything else is an error. */
3400 re_token_t token2;
3401 (void) peek_token_bracket (&token2, regexp, syntax);
3402 if (token2.type != OP_CLOSE_BRACKET)
3403 /* The actual error value is not standardized since this whole
3404 case is undefined. But ERANGE makes good sense. */
3405 return REG_ERANGE;
3406 }
3407 elem->type = SB_CHAR;
3408 elem->opr.ch = token->opr.c;
3409 return REG_NOERROR;
3410 }
3411
3412 /* Parse a bracket symbol in the bracket expression. Bracket symbols are
3413 such as [:<character_class>:], [.<collating_element>.], and
3414 [=<equivalent_class>=]. */
3415
3416 static reg_errcode_t
3417 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3418 re_token_t *token)
3419 {
3420 unsigned char ch, delim = token->opr.c;
3421 int i = 0;
3422 if (re_string_eoi(regexp))
3423 return REG_EBRACK;
3424 for (;; ++i)
3425 {
3426 if (i >= BRACKET_NAME_BUF_SIZE)
3427 return REG_EBRACK;
3428 if (token->type == OP_OPEN_CHAR_CLASS)
3429 ch = re_string_fetch_byte_case (regexp);
3430 else
3431 ch = re_string_fetch_byte (regexp);
3432 if (re_string_eoi(regexp))
3433 return REG_EBRACK;
3434 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3435 break;
3436 elem->opr.name[i] = ch;
3437 }
3438 re_string_skip_bytes (regexp, 1);
3439 elem->opr.name[i] = '\0';
3440 switch (token->type)
3441 {
3442 case OP_OPEN_COLL_ELEM:
3443 elem->type = COLL_SYM;
3444 break;
3445 case OP_OPEN_EQUIV_CLASS:
3446 elem->type = EQUIV_CLASS;
3447 break;
3448 case OP_OPEN_CHAR_CLASS:
3449 elem->type = CHAR_CLASS;
3450 break;
3451 default:
3452 break;
3453 }
3454 return REG_NOERROR;
3455 }
3456
3457 /* Helper function for parse_bracket_exp.
3458 Build the equivalence class which is represented by NAME.
3459 The result are written to MBCSET and SBCSET.
3460 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3461 is a pointer argument since we may update it. */
3462
3463 static reg_errcode_t
3464 #ifdef RE_ENABLE_I18N
3465 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3466 Idx *equiv_class_alloc, const unsigned char *name)
3467 #else /* not RE_ENABLE_I18N */
3468 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3469 #endif /* not RE_ENABLE_I18N */
3470 {
3471 #ifdef _LIBC
3472 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3473 if (nrules != 0)
3474 {
3475 const int32_t *table, *indirect;
3476 const unsigned char *weights, *extra, *cp;
3477 unsigned char char_buf[2];
3478 int32_t idx1, idx2;
3479 unsigned int ch;
3480 size_t len;
3481 /* This #include defines a local function! */
3482 # include <locale/weight.h>
3483 /* Calculate the index for equivalence class. */
3484 cp = name;
3485 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3486 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3487 _NL_COLLATE_WEIGHTMB);
3488 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3489 _NL_COLLATE_EXTRAMB);
3490 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3491 _NL_COLLATE_INDIRECTMB);
3492 idx1 = findidx (&cp, -1);
3493 if (BE (idx1 == 0 || *cp != '\0', 0))
3494 /* This isn't a valid character. */
3495 return REG_ECOLLATE;
3496
3497 /* Build single byte matching table for this equivalence class. */
3498 len = weights[idx1 & 0xffffff];
3499 for (ch = 0; ch < SBC_MAX; ++ch)
3500 {
3501 char_buf[0] = ch;
3502 cp = char_buf;
3503 idx2 = findidx (&cp, 1);
3504 /*
3505 idx2 = table[ch];
3506 */
3507 if (idx2 == 0)
3508 /* This isn't a valid character. */
3509 continue;
3510 /* Compare only if the length matches and the collation rule
3511 index is the same. */
3512 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3513 {
3514 int cnt = 0;
3515
3516 while (cnt <= len &&
3517 weights[(idx1 & 0xffffff) + 1 + cnt]
3518 == weights[(idx2 & 0xffffff) + 1 + cnt])
3519 ++cnt;
3520
3521 if (cnt > len)
3522 bitset_set (sbcset, ch);
3523 }
3524 }
3525 /* Check whether the array has enough space. */
3526 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3527 {
3528 /* Not enough, realloc it. */
3529 /* +1 in case of mbcset->nequiv_classes is 0. */
3530 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3531 /* Use realloc since the array is NULL if *alloc == 0. */
3532 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3533 int32_t,
3534 new_equiv_class_alloc);
3535 if (BE (new_equiv_classes == NULL, 0))
3536 return REG_ESPACE;
3537 mbcset->equiv_classes = new_equiv_classes;
3538 *equiv_class_alloc = new_equiv_class_alloc;
3539 }
3540 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3541 }
3542 else
3543 #endif /* _LIBC */
3544 {
3545 if (BE (strlen ((const char *) name) != 1, 0))
3546 return REG_ECOLLATE;
3547 bitset_set (sbcset, *name);
3548 }
3549 return REG_NOERROR;
3550 }
3551
3552 /* Helper function for parse_bracket_exp.
3553 Build the character class which is represented by NAME.
3554 The result are written to MBCSET and SBCSET.
3555 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3556 is a pointer argument since we may update it. */
3557
3558 static reg_errcode_t
3559 #ifdef RE_ENABLE_I18N
3560 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3561 re_charset_t *mbcset, Idx *char_class_alloc,
3562 const char *class_name, reg_syntax_t syntax)
3563 #else /* not RE_ENABLE_I18N */
3564 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3565 const char *class_name, reg_syntax_t syntax)
3566 #endif /* not RE_ENABLE_I18N */
3567 {
3568 int i;
3569 const char *name = class_name;
3570
3571 /* In case of REG_ICASE "upper" and "lower" match the both of
3572 upper and lower cases. */
3573 if ((syntax & RE_ICASE)
3574 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3575 name = "alpha";
3576
3577 #ifdef RE_ENABLE_I18N
3578 /* Check the space of the arrays. */
3579 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3580 {
3581 /* Not enough, realloc it. */
3582 /* +1 in case of mbcset->nchar_classes is 0. */
3583 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3584 /* Use realloc since array is NULL if *alloc == 0. */
3585 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3586 new_char_class_alloc);
3587 if (BE (new_char_classes == NULL, 0))
3588 return REG_ESPACE;
3589 mbcset->char_classes = new_char_classes;
3590 *char_class_alloc = new_char_class_alloc;
3591 }
3592 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3593 #endif /* RE_ENABLE_I18N */
3594
3595 #define BUILD_CHARCLASS_LOOP(ctype_func) \
3596 do { \
3597 if (BE (trans != NULL, 0)) \
3598 { \
3599 for (i = 0; i < SBC_MAX; ++i) \
3600 if (ctype_func (i)) \
3601 bitset_set (sbcset, trans[i]); \
3602 } \
3603 else \
3604 { \
3605 for (i = 0; i < SBC_MAX; ++i) \
3606 if (ctype_func (i)) \
3607 bitset_set (sbcset, i); \
3608 } \
3609 } while (0)
3610
3611 if (strcmp (name, "alnum") == 0)
3612 BUILD_CHARCLASS_LOOP (isalnum);
3613 else if (strcmp (name, "cntrl") == 0)
3614 BUILD_CHARCLASS_LOOP (iscntrl);
3615 else if (strcmp (name, "lower") == 0)
3616 BUILD_CHARCLASS_LOOP (islower);
3617 else if (strcmp (name, "space") == 0)
3618 BUILD_CHARCLASS_LOOP (isspace);
3619 else if (strcmp (name, "alpha") == 0)
3620 BUILD_CHARCLASS_LOOP (isalpha);
3621 else if (strcmp (name, "digit") == 0)
3622 BUILD_CHARCLASS_LOOP (isdigit);
3623 else if (strcmp (name, "print") == 0)
3624 BUILD_CHARCLASS_LOOP (isprint);
3625 else if (strcmp (name, "upper") == 0)
3626 BUILD_CHARCLASS_LOOP (isupper);
3627 else if (strcmp (name, "blank") == 0)
3628 BUILD_CHARCLASS_LOOP (isblank);
3629 else if (strcmp (name, "graph") == 0)
3630 BUILD_CHARCLASS_LOOP (isgraph);
3631 else if (strcmp (name, "punct") == 0)
3632 BUILD_CHARCLASS_LOOP (ispunct);
3633 else if (strcmp (name, "xdigit") == 0)
3634 BUILD_CHARCLASS_LOOP (isxdigit);
3635 else
3636 return REG_ECTYPE;
3637
3638 return REG_NOERROR;
3639 }
3640
3641 static bin_tree_t *
3642 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3643 const char *class_name,
3644 const char *extra, bool non_match,
3645 reg_errcode_t *err)
3646 {
3647 re_bitset_ptr_t sbcset;
3648 #ifdef RE_ENABLE_I18N
3649 re_charset_t *mbcset;
3650 Idx alloc = 0;
3651 #endif /* not RE_ENABLE_I18N */
3652 reg_errcode_t ret;
3653 re_token_t br_token;
3654 bin_tree_t *tree;
3655
3656 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3657 #ifdef RE_ENABLE_I18N
3658 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3659 #endif /* RE_ENABLE_I18N */
3660
3661 #ifdef RE_ENABLE_I18N
3662 if (BE (sbcset == NULL || mbcset == NULL, 0))
3663 #else /* not RE_ENABLE_I18N */
3664 if (BE (sbcset == NULL, 0))
3665 #endif /* not RE_ENABLE_I18N */
3666 {
3667 *err = REG_ESPACE;
3668 return NULL;
3669 }
3670
3671 if (non_match)
3672 {
3673 #ifdef RE_ENABLE_I18N
3674 mbcset->non_match = 1;
3675 #endif /* not RE_ENABLE_I18N */
3676 }
3677
3678 /* We don't care the syntax in this case. */
3679 ret = build_charclass (trans, sbcset,
3680 #ifdef RE_ENABLE_I18N
3681 mbcset, &alloc,
3682 #endif /* RE_ENABLE_I18N */
3683 class_name, 0);
3684
3685 if (BE (ret != REG_NOERROR, 0))
3686 {
3687 re_free (sbcset);
3688 #ifdef RE_ENABLE_I18N
3689 free_charset (mbcset);
3690 #endif /* RE_ENABLE_I18N */
3691 *err = ret;
3692 return NULL;
3693 }
3694 /* \w match '_' also. */
3695 for (; *extra; extra++)
3696 bitset_set (sbcset, *extra);
3697
3698 /* If it is non-matching list. */
3699 if (non_match)
3700 bitset_not (sbcset);
3701
3702 #ifdef RE_ENABLE_I18N
3703 /* Ensure only single byte characters are set. */
3704 if (dfa->mb_cur_max > 1)
3705 bitset_mask (sbcset, dfa->sb_char);
3706 #endif
3707
3708 /* Build a tree for simple bracket. */
3709 br_token.type = SIMPLE_BRACKET;
3710 br_token.opr.sbcset = sbcset;
3711 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3712 if (BE (tree == NULL, 0))
3713 goto build_word_op_espace;
3714
3715 #ifdef RE_ENABLE_I18N
3716 if (dfa->mb_cur_max > 1)
3717 {
3718 bin_tree_t *mbc_tree;
3719 /* Build a tree for complex bracket. */
3720 br_token.type = COMPLEX_BRACKET;
3721 br_token.opr.mbcset = mbcset;
3722 dfa->has_mb_node = 1;
3723 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3724 if (BE (mbc_tree == NULL, 0))
3725 goto build_word_op_espace;
3726 /* Then join them by ALT node. */
3727 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3728 if (BE (mbc_tree != NULL, 1))
3729 return tree;
3730 }
3731 else
3732 {
3733 free_charset (mbcset);
3734 return tree;
3735 }
3736 #else /* not RE_ENABLE_I18N */
3737 return tree;
3738 #endif /* not RE_ENABLE_I18N */
3739
3740 build_word_op_espace:
3741 re_free (sbcset);
3742 #ifdef RE_ENABLE_I18N
3743 free_charset (mbcset);
3744 #endif /* RE_ENABLE_I18N */
3745 *err = REG_ESPACE;
3746 return NULL;
3747 }
3748
3749 /* This is intended for the expressions like "a{1,3}".
3750 Fetch a number from 'input', and return the number.
3751 Return REG_MISSING if the number field is empty like "{,1}".
3752 Return RE_DUP_MAX + 1 if the number field is too large.
3753 Return REG_ERROR if an error occurred. */
3754
3755 static Idx
3756 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3757 {
3758 Idx num = REG_MISSING;
3759 unsigned char c;
3760 while (1)
3761 {
3762 fetch_token (token, input, syntax);
3763 c = token->opr.c;
3764 if (BE (token->type == END_OF_RE, 0))
3765 return REG_ERROR;
3766 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3767 break;
3768 num = ((token->type != CHARACTER || c < '0' || '9' < c
3769 || num == REG_ERROR)
3770 ? REG_ERROR
3771 : num == REG_MISSING
3772 ? c - '0'
3773 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
3774 }
3775 return num;
3776 }
3777 \f
3778 #ifdef RE_ENABLE_I18N
3779 static void
3780 free_charset (re_charset_t *cset)
3781 {
3782 re_free (cset->mbchars);
3783 # ifdef _LIBC
3784 re_free (cset->coll_syms);
3785 re_free (cset->equiv_classes);
3786 re_free (cset->range_starts);
3787 re_free (cset->range_ends);
3788 # endif
3789 re_free (cset->char_classes);
3790 re_free (cset);
3791 }
3792 #endif /* RE_ENABLE_I18N */
3793 \f
3794 /* Functions for binary tree operation. */
3795
3796 /* Create a tree node. */
3797
3798 static bin_tree_t *
3799 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3800 re_token_type_t type)
3801 {
3802 re_token_t t;
3803 t.type = type;
3804 return create_token_tree (dfa, left, right, &t);
3805 }
3806
3807 static bin_tree_t *
3808 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3809 const re_token_t *token)
3810 {
3811 bin_tree_t *tree;
3812 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3813 {
3814 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3815
3816 if (storage == NULL)
3817 return NULL;
3818 storage->next = dfa->str_tree_storage;
3819 dfa->str_tree_storage = storage;
3820 dfa->str_tree_storage_idx = 0;
3821 }
3822 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3823
3824 tree->parent = NULL;
3825 tree->left = left;
3826 tree->right = right;
3827 tree->token = *token;
3828 tree->token.duplicated = 0;
3829 tree->token.opt_subexp = 0;
3830 tree->first = NULL;
3831 tree->next = NULL;
3832 tree->node_idx = REG_MISSING;
3833
3834 if (left != NULL)
3835 left->parent = tree;
3836 if (right != NULL)
3837 right->parent = tree;
3838 return tree;
3839 }
3840
3841 /* Mark the tree SRC as an optional subexpression.
3842 To be called from preorder or postorder. */
3843
3844 static reg_errcode_t
3845 mark_opt_subexp (void *extra, bin_tree_t *node)
3846 {
3847 Idx idx = (uintptr_t) extra;
3848 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3849 node->token.opt_subexp = 1;
3850
3851 return REG_NOERROR;
3852 }
3853
3854 /* Free the allocated memory inside NODE. */
3855
3856 static void
3857 free_token (re_token_t *node)
3858 {
3859 #ifdef RE_ENABLE_I18N
3860 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3861 free_charset (node->opr.mbcset);
3862 else
3863 #endif /* RE_ENABLE_I18N */
3864 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3865 re_free (node->opr.sbcset);
3866 }
3867
3868 /* Worker function for tree walking. Free the allocated memory inside NODE
3869 and its children. */
3870
3871 static reg_errcode_t
3872 free_tree (void *extra, bin_tree_t *node)
3873 {
3874 free_token (&node->token);
3875 return REG_NOERROR;
3876 }
3877
3878
3879 /* Duplicate the node SRC, and return new node. This is a preorder
3880 visit similar to the one implemented by the generic visitor, but
3881 we need more infrastructure to maintain two parallel trees --- so,
3882 it's easier to duplicate. */
3883
3884 static bin_tree_t *
3885 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3886 {
3887 const bin_tree_t *node;
3888 bin_tree_t *dup_root;
3889 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3890
3891 for (node = root; ; )
3892 {
3893 /* Create a new tree and link it back to the current parent. */
3894 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3895 if (*p_new == NULL)
3896 return NULL;
3897 (*p_new)->parent = dup_node;
3898 (*p_new)->token.duplicated = 1;
3899 dup_node = *p_new;
3900
3901 /* Go to the left node, or up and to the right. */
3902 if (node->left)
3903 {
3904 node = node->left;
3905 p_new = &dup_node->left;
3906 }
3907 else
3908 {
3909 const bin_tree_t *prev = NULL;
3910 while (node->right == prev || node->right == NULL)
3911 {
3912 prev = node;
3913 node = node->parent;
3914 dup_node = dup_node->parent;
3915 if (!node)
3916 return dup_root;
3917 }
3918 node = node->right;
3919 p_new = &dup_node->right;
3920 }
3921 }
3922 }