Array documentation fixes
[bpt/guile.git] / lib / regcomp.c
CommitLineData
eb4a14ed
LC
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2012 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License along
005de2e8 17 with this program; if not, see <http://www.gnu.org/licenses/>. */
eb4a14ed
LC
18
19static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
20 size_t length, reg_syntax_t syntax);
21static void re_compile_fastmap_iter (regex_t *bufp,
22 const re_dfastate_t *init_state,
23 char *fastmap);
24static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
25#ifdef RE_ENABLE_I18N
26static void free_charset (re_charset_t *cset);
27#endif /* RE_ENABLE_I18N */
28static void free_workarea_compile (regex_t *preg);
29static reg_errcode_t create_initial_state (re_dfa_t *dfa);
30#ifdef RE_ENABLE_I18N
31static void optimize_utf8 (re_dfa_t *dfa);
32#endif
33static reg_errcode_t analyze (regex_t *preg);
34static reg_errcode_t preorder (bin_tree_t *root,
35 reg_errcode_t (fn (void *, bin_tree_t *)),
36 void *extra);
37static reg_errcode_t postorder (bin_tree_t *root,
38 reg_errcode_t (fn (void *, bin_tree_t *)),
39 void *extra);
40static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
41static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
42static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
43 bin_tree_t *node);
44static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
45static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
46static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
47static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
48static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
49 unsigned int constraint);
50static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
51static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
52 Idx node, bool root);
53static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
54static Idx fetch_number (re_string_t *input, re_token_t *token,
55 reg_syntax_t syntax);
56static int peek_token (re_token_t *token, re_string_t *input,
57 reg_syntax_t syntax) internal_function;
58static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
59 reg_syntax_t syntax, reg_errcode_t *err);
60static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
61 re_token_t *token, reg_syntax_t syntax,
62 Idx nest, reg_errcode_t *err);
63static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
64 re_token_t *token, reg_syntax_t syntax,
65 Idx nest, reg_errcode_t *err);
66static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
67 re_token_t *token, reg_syntax_t syntax,
68 Idx nest, reg_errcode_t *err);
69static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
70 re_token_t *token, reg_syntax_t syntax,
71 Idx nest, reg_errcode_t *err);
72static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
73 re_dfa_t *dfa, re_token_t *token,
74 reg_syntax_t syntax, reg_errcode_t *err);
75static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
76 re_token_t *token, reg_syntax_t syntax,
77 reg_errcode_t *err);
78static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
79 re_string_t *regexp,
80 re_token_t *token, int token_len,
81 re_dfa_t *dfa,
82 reg_syntax_t syntax,
83 bool accept_hyphen);
84static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
85 re_string_t *regexp,
86 re_token_t *token);
87#ifdef RE_ENABLE_I18N
88static reg_errcode_t build_equiv_class (bitset_t sbcset,
89 re_charset_t *mbcset,
90 Idx *equiv_class_alloc,
91 const unsigned char *name);
92static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
93 bitset_t sbcset,
94 re_charset_t *mbcset,
95 Idx *char_class_alloc,
96 const unsigned char *class_name,
97 reg_syntax_t syntax);
98#else /* not RE_ENABLE_I18N */
99static reg_errcode_t build_equiv_class (bitset_t sbcset,
100 const unsigned char *name);
101static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
102 bitset_t sbcset,
103 const unsigned char *class_name,
104 reg_syntax_t syntax);
105#endif /* not RE_ENABLE_I18N */
106static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
107 RE_TRANSLATE_TYPE trans,
108 const unsigned char *class_name,
109 const unsigned char *extra,
110 bool non_match, reg_errcode_t *err);
111static bin_tree_t *create_tree (re_dfa_t *dfa,
112 bin_tree_t *left, bin_tree_t *right,
113 re_token_type_t type);
114static bin_tree_t *create_token_tree (re_dfa_t *dfa,
115 bin_tree_t *left, bin_tree_t *right,
116 const re_token_t *token);
117static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
118static void free_token (re_token_t *node);
119static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
120static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
121\f
122/* This table gives an error message for each of the error codes listed
123 in regex.h. Obviously the order here has to be same as there.
124 POSIX doesn't require that we do anything for REG_NOERROR,
125 but why not be nice? */
126
127static const char __re_error_msgid[] =
128 {
129#define REG_NOERROR_IDX 0
130 gettext_noop ("Success") /* REG_NOERROR */
131 "\0"
132#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
133 gettext_noop ("No match") /* REG_NOMATCH */
134 "\0"
135#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
136 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
137 "\0"
138#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
139 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
140 "\0"
141#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
142 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
143 "\0"
144#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
145 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
146 "\0"
147#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
148 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
149 "\0"
150#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
151 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
152 "\0"
153#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
154 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
155 "\0"
156#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
157 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
158 "\0"
159#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
160 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
161 "\0"
162#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
163 gettext_noop ("Invalid range end") /* REG_ERANGE */
164 "\0"
165#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
166 gettext_noop ("Memory exhausted") /* REG_ESPACE */
167 "\0"
168#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
169 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
170 "\0"
171#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
172 gettext_noop ("Premature end of regular expression") /* REG_EEND */
173 "\0"
174#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
175 gettext_noop ("Regular expression too big") /* REG_ESIZE */
176 "\0"
177#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
178 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
179 };
180
181static const size_t __re_error_msgid_idx[] =
182 {
183 REG_NOERROR_IDX,
184 REG_NOMATCH_IDX,
185 REG_BADPAT_IDX,
186 REG_ECOLLATE_IDX,
187 REG_ECTYPE_IDX,
188 REG_EESCAPE_IDX,
189 REG_ESUBREG_IDX,
190 REG_EBRACK_IDX,
191 REG_EPAREN_IDX,
192 REG_EBRACE_IDX,
193 REG_BADBR_IDX,
194 REG_ERANGE_IDX,
195 REG_ESPACE_IDX,
196 REG_BADRPT_IDX,
197 REG_EEND_IDX,
198 REG_ESIZE_IDX,
199 REG_ERPAREN_IDX
200 };
201\f
202/* Entry points for GNU code. */
203
204/* re_compile_pattern is the GNU regular expression compiler: it
205 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
206 Returns 0 if the pattern was valid, otherwise an error string.
207
208 Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
209 are set in BUFP on entry. */
210
211#ifdef _LIBC
212const char *
213re_compile_pattern (pattern, length, bufp)
214 const char *pattern;
215 size_t length;
216 struct re_pattern_buffer *bufp;
217#else /* size_t might promote */
218const char *
219re_compile_pattern (const char *pattern, size_t length,
220 struct re_pattern_buffer *bufp)
221#endif
222{
223 reg_errcode_t ret;
224
225 /* And GNU code determines whether or not to get register information
226 by passing null for the REGS argument to re_match, etc., not by
227 setting no_sub, unless RE_NO_SUB is set. */
228 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
229
230 /* Match anchors at newline. */
231 bufp->newline_anchor = 1;
232
233 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
234
235 if (!ret)
236 return NULL;
237 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
238}
239#ifdef _LIBC
240weak_alias (__re_compile_pattern, re_compile_pattern)
241#endif
242
243/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
244 also be assigned to arbitrarily: each pattern buffer stores its own
245 syntax, so it can be changed between regex compilations. */
246/* This has no initializer because initialized variables in Emacs
247 become read-only after dumping. */
248reg_syntax_t re_syntax_options;
249
250
251/* Specify the precise syntax of regexps for compilation. This provides
252 for compatibility for various utilities which historically have
253 different, incompatible syntaxes.
254
255 The argument SYNTAX is a bit mask comprised of the various bits
256 defined in regex.h. We return the old syntax. */
257
258reg_syntax_t
259re_set_syntax (syntax)
260 reg_syntax_t syntax;
261{
262 reg_syntax_t ret = re_syntax_options;
263
264 re_syntax_options = syntax;
265 return ret;
266}
267#ifdef _LIBC
268weak_alias (__re_set_syntax, re_set_syntax)
269#endif
270
271int
272re_compile_fastmap (bufp)
273 struct re_pattern_buffer *bufp;
274{
005de2e8 275 re_dfa_t *dfa = bufp->buffer;
eb4a14ed
LC
276 char *fastmap = bufp->fastmap;
277
278 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
279 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
280 if (dfa->init_state != dfa->init_state_word)
281 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
282 if (dfa->init_state != dfa->init_state_nl)
283 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
284 if (dfa->init_state != dfa->init_state_begbuf)
285 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
286 bufp->fastmap_accurate = 1;
287 return 0;
288}
289#ifdef _LIBC
290weak_alias (__re_compile_fastmap, re_compile_fastmap)
291#endif
292
293static inline void
294__attribute ((always_inline))
295re_set_fastmap (char *fastmap, bool icase, int ch)
296{
297 fastmap[ch] = 1;
298 if (icase)
299 fastmap[tolower (ch)] = 1;
300}
301
302/* Helper function for re_compile_fastmap.
303 Compile fastmap for the initial_state INIT_STATE. */
304
305static void
306re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
307 char *fastmap)
308{
005de2e8 309 re_dfa_t *dfa = bufp->buffer;
eb4a14ed
LC
310 Idx node_cnt;
311 bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
312 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
313 {
314 Idx node = init_state->nodes.elems[node_cnt];
315 re_token_type_t type = dfa->nodes[node].type;
316
317 if (type == CHARACTER)
318 {
319 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
320#ifdef RE_ENABLE_I18N
321 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
322 {
323 unsigned char buf[MB_LEN_MAX];
324 unsigned char *p;
325 wchar_t wc;
326 mbstate_t state;
327
328 p = buf;
329 *p++ = dfa->nodes[node].opr.c;
330 while (++node < dfa->nodes_len
331 && dfa->nodes[node].type == CHARACTER
332 && dfa->nodes[node].mb_partial)
333 *p++ = dfa->nodes[node].opr.c;
334 memset (&state, '\0', sizeof (state));
335 if (__mbrtowc (&wc, (const char *) buf, p - buf,
336 &state) == p - buf
337 && (__wcrtomb ((char *) buf, towlower (wc), &state)
338 != (size_t) -1))
339 re_set_fastmap (fastmap, false, buf[0]);
340 }
341#endif
342 }
343 else if (type == SIMPLE_BRACKET)
344 {
345 int i, ch;
346 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
347 {
348 int j;
349 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
350 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
351 if (w & ((bitset_word_t) 1 << j))
352 re_set_fastmap (fastmap, icase, ch);
353 }
354 }
355#ifdef RE_ENABLE_I18N
356 else if (type == COMPLEX_BRACKET)
357 {
358 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
359 Idx i;
360
361# ifdef _LIBC
362 /* See if we have to try all bytes which start multiple collation
363 elements.
364 e.g. In da_DK, we want to catch 'a' since "aa" is a valid
365 collation element, and don't catch 'b' since 'b' is
366 the only collation element which starts from 'b' (and
367 it is caught by SIMPLE_BRACKET). */
368 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
369 && (cset->ncoll_syms || cset->nranges))
370 {
371 const int32_t *table = (const int32_t *)
372 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
373 for (i = 0; i < SBC_MAX; ++i)
374 if (table[i] < 0)
375 re_set_fastmap (fastmap, icase, i);
376 }
377# endif /* _LIBC */
378
379 /* See if we have to start the match at all multibyte characters,
380 i.e. where we would not find an invalid sequence. This only
381 applies to multibyte character sets; for single byte character
382 sets, the SIMPLE_BRACKET again suffices. */
383 if (dfa->mb_cur_max > 1
384 && (cset->nchar_classes || cset->non_match || cset->nranges
385# ifdef _LIBC
386 || cset->nequiv_classes
387# endif /* _LIBC */
388 ))
389 {
390 unsigned char c = 0;
391 do
392 {
393 mbstate_t mbs;
394 memset (&mbs, 0, sizeof (mbs));
395 if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
396 re_set_fastmap (fastmap, false, (int) c);
397 }
398 while (++c != 0);
399 }
400
401 else
402 {
403 /* ... Else catch all bytes which can start the mbchars. */
404 for (i = 0; i < cset->nmbchars; ++i)
405 {
406 char buf[256];
407 mbstate_t state;
408 memset (&state, '\0', sizeof (state));
409 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
410 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
411 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
412 {
413 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
414 != (size_t) -1)
415 re_set_fastmap (fastmap, false, *(unsigned char *) buf);
416 }
417 }
418 }
419 }
420#endif /* RE_ENABLE_I18N */
421 else if (type == OP_PERIOD
422#ifdef RE_ENABLE_I18N
423 || type == OP_UTF8_PERIOD
424#endif /* RE_ENABLE_I18N */
425 || type == END_OF_RE)
426 {
427 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
428 if (type == END_OF_RE)
429 bufp->can_be_null = 1;
430 return;
431 }
432 }
433}
434\f
435/* Entry point for POSIX code. */
436/* regcomp takes a regular expression as a string and compiles it.
437
438 PREG is a regex_t *. We do not expect any fields to be initialized,
439 since POSIX says we shouldn't. Thus, we set
440
441 'buffer' to the compiled pattern;
442 'used' to the length of the compiled pattern;
443 'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
444 REG_EXTENDED bit in CFLAGS is set; otherwise, to
445 RE_SYNTAX_POSIX_BASIC;
446 'newline_anchor' to REG_NEWLINE being set in CFLAGS;
447 'fastmap' to an allocated space for the fastmap;
448 'fastmap_accurate' to zero;
449 're_nsub' to the number of subexpressions in PATTERN.
450
451 PATTERN is the address of the pattern string.
452
453 CFLAGS is a series of bits which affect compilation.
454
455 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
456 use POSIX basic syntax.
457
458 If REG_NEWLINE is set, then . and [^...] don't match newline.
459 Also, regexec will try a match beginning after every newline.
460
461 If REG_ICASE is set, then we considers upper- and lowercase
462 versions of letters to be equivalent when matching.
463
464 If REG_NOSUB is set, then when PREG is passed to regexec, that
465 routine will report only success or failure, and nothing about the
466 registers.
467
468 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
469 the return codes and their meanings.) */
470
471int
472regcomp (preg, pattern, cflags)
473 regex_t *_Restrict_ preg;
474 const char *_Restrict_ pattern;
475 int cflags;
476{
477 reg_errcode_t ret;
478 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
479 : RE_SYNTAX_POSIX_BASIC);
480
481 preg->buffer = NULL;
482 preg->allocated = 0;
483 preg->used = 0;
484
485 /* Try to allocate space for the fastmap. */
486 preg->fastmap = re_malloc (char, SBC_MAX);
487 if (BE (preg->fastmap == NULL, 0))
488 return REG_ESPACE;
489
490 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
491
492 /* If REG_NEWLINE is set, newlines are treated differently. */
493 if (cflags & REG_NEWLINE)
494 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
495 syntax &= ~RE_DOT_NEWLINE;
496 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
497 /* It also changes the matching behavior. */
498 preg->newline_anchor = 1;
499 }
500 else
501 preg->newline_anchor = 0;
502 preg->no_sub = !!(cflags & REG_NOSUB);
503 preg->translate = NULL;
504
505 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
506
507 /* POSIX doesn't distinguish between an unmatched open-group and an
508 unmatched close-group: both are REG_EPAREN. */
509 if (ret == REG_ERPAREN)
510 ret = REG_EPAREN;
511
512 /* We have already checked preg->fastmap != NULL. */
513 if (BE (ret == REG_NOERROR, 1))
514 /* Compute the fastmap now, since regexec cannot modify the pattern
515 buffer. This function never fails in this implementation. */
516 (void) re_compile_fastmap (preg);
517 else
518 {
519 /* Some error occurred while compiling the expression. */
520 re_free (preg->fastmap);
521 preg->fastmap = NULL;
522 }
523
524 return (int) ret;
525}
526#ifdef _LIBC
527weak_alias (__regcomp, regcomp)
528#endif
529
530/* Returns a message corresponding to an error code, ERRCODE, returned
531 from either regcomp or regexec. We don't use PREG here. */
532
533#ifdef _LIBC
534size_t
535regerror (errcode, preg, errbuf, errbuf_size)
536 int errcode;
537 const regex_t *_Restrict_ preg;
538 char *_Restrict_ errbuf;
539 size_t errbuf_size;
540#else /* size_t might promote */
541size_t
542regerror (int errcode, const regex_t *_Restrict_ preg,
543 char *_Restrict_ errbuf, size_t errbuf_size)
544#endif
545{
546 const char *msg;
547 size_t msg_size;
548
549 if (BE (errcode < 0
550 || errcode >= (int) (sizeof (__re_error_msgid_idx)
551 / sizeof (__re_error_msgid_idx[0])), 0))
552 /* Only error codes returned by the rest of the code should be passed
553 to this routine. If we are given anything else, or if other regex
554 code generates an invalid error code, then the program has a bug.
555 Dump core so we can fix it. */
556 abort ();
557
558 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
559
560 msg_size = strlen (msg) + 1; /* Includes the null. */
561
562 if (BE (errbuf_size != 0, 1))
563 {
564 size_t cpy_size = msg_size;
565 if (BE (msg_size > errbuf_size, 0))
566 {
567 cpy_size = errbuf_size - 1;
568 errbuf[cpy_size] = '\0';
569 }
570 memcpy (errbuf, msg, cpy_size);
571 }
572
573 return msg_size;
574}
575#ifdef _LIBC
576weak_alias (__regerror, regerror)
577#endif
578
579
580#ifdef RE_ENABLE_I18N
581/* This static array is used for the map to single-byte characters when
582 UTF-8 is used. Otherwise we would allocate memory just to initialize
583 it the same all the time. UTF-8 is the preferred encoding so this is
584 a worthwhile optimization. */
585static const bitset_t utf8_sb_map =
586{
587 /* Set the first 128 bits. */
005de2e8
LC
588# ifdef __GNUC__
589 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
590# else
591# if 4 * BITSET_WORD_BITS < ASCII_CHARS
592# error "bitset_word_t is narrower than 32 bits"
593# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
eb4a14ed 594 BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
005de2e8 595# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
eb4a14ed 596 BITSET_WORD_MAX, BITSET_WORD_MAX,
005de2e8 597# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
eb4a14ed 598 BITSET_WORD_MAX,
005de2e8 599# endif
eb4a14ed
LC
600 (BITSET_WORD_MAX
601 >> (SBC_MAX % BITSET_WORD_BITS == 0
602 ? 0
603 : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
005de2e8 604# endif
eb4a14ed
LC
605};
606#endif
607
608
609static void
610free_dfa_content (re_dfa_t *dfa)
611{
612 Idx i, j;
613
614 if (dfa->nodes)
615 for (i = 0; i < dfa->nodes_len; ++i)
616 free_token (dfa->nodes + i);
617 re_free (dfa->nexts);
618 for (i = 0; i < dfa->nodes_len; ++i)
619 {
620 if (dfa->eclosures != NULL)
621 re_node_set_free (dfa->eclosures + i);
622 if (dfa->inveclosures != NULL)
623 re_node_set_free (dfa->inveclosures + i);
624 if (dfa->edests != NULL)
625 re_node_set_free (dfa->edests + i);
626 }
627 re_free (dfa->edests);
628 re_free (dfa->eclosures);
629 re_free (dfa->inveclosures);
630 re_free (dfa->nodes);
631
632 if (dfa->state_table)
633 for (i = 0; i <= dfa->state_hash_mask; ++i)
634 {
635 struct re_state_table_entry *entry = dfa->state_table + i;
636 for (j = 0; j < entry->num; ++j)
637 {
638 re_dfastate_t *state = entry->array[j];
639 free_state (state);
640 }
641 re_free (entry->array);
642 }
643 re_free (dfa->state_table);
644#ifdef RE_ENABLE_I18N
645 if (dfa->sb_char != utf8_sb_map)
646 re_free (dfa->sb_char);
647#endif
648 re_free (dfa->subexp_map);
649#ifdef DEBUG
650 re_free (dfa->re_str);
651#endif
652
653 re_free (dfa);
654}
655
656
657/* Free dynamically allocated space used by PREG. */
658
659void
660regfree (preg)
661 regex_t *preg;
662{
005de2e8 663 re_dfa_t *dfa = preg->buffer;
eb4a14ed
LC
664 if (BE (dfa != NULL, 1))
665 free_dfa_content (dfa);
666 preg->buffer = NULL;
667 preg->allocated = 0;
668
669 re_free (preg->fastmap);
670 preg->fastmap = NULL;
671
672 re_free (preg->translate);
673 preg->translate = NULL;
674}
675#ifdef _LIBC
676weak_alias (__regfree, regfree)
677#endif
678\f
679/* Entry points compatible with 4.2 BSD regex library. We don't define
680 them unless specifically requested. */
681
682#if defined _REGEX_RE_COMP || defined _LIBC
683
684/* BSD has one and only one pattern buffer. */
685static struct re_pattern_buffer re_comp_buf;
686
687char *
688# ifdef _LIBC
689/* Make these definitions weak in libc, so POSIX programs can redefine
690 these names if they don't use our functions, and still use
691 regcomp/regexec above without link errors. */
692weak_function
693# endif
694re_comp (s)
695 const char *s;
696{
697 reg_errcode_t ret;
698 char *fastmap;
699
700 if (!s)
701 {
702 if (!re_comp_buf.buffer)
703 return gettext ("No previous regular expression");
704 return 0;
705 }
706
707 if (re_comp_buf.buffer)
708 {
709 fastmap = re_comp_buf.fastmap;
710 re_comp_buf.fastmap = NULL;
711 __regfree (&re_comp_buf);
712 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
713 re_comp_buf.fastmap = fastmap;
714 }
715
716 if (re_comp_buf.fastmap == NULL)
717 {
718 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
719 if (re_comp_buf.fastmap == NULL)
720 return (char *) gettext (__re_error_msgid
721 + __re_error_msgid_idx[(int) REG_ESPACE]);
722 }
723
724 /* Since 're_exec' always passes NULL for the 'regs' argument, we
725 don't need to initialize the pattern buffer fields which affect it. */
726
727 /* Match anchors at newlines. */
728 re_comp_buf.newline_anchor = 1;
729
730 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
731
732 if (!ret)
733 return NULL;
734
735 /* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
736 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
737}
738
739#ifdef _LIBC
740libc_freeres_fn (free_mem)
741{
742 __regfree (&re_comp_buf);
743}
744#endif
745
746#endif /* _REGEX_RE_COMP */
747\f
748/* Internal entry point.
749 Compile the regular expression PATTERN, whose length is LENGTH.
750 SYNTAX indicate regular expression's syntax. */
751
752static reg_errcode_t
753re_compile_internal (regex_t *preg, const char * pattern, size_t length,
754 reg_syntax_t syntax)
755{
756 reg_errcode_t err = REG_NOERROR;
757 re_dfa_t *dfa;
758 re_string_t regexp;
759
760 /* Initialize the pattern buffer. */
761 preg->fastmap_accurate = 0;
762 preg->syntax = syntax;
763 preg->not_bol = preg->not_eol = 0;
764 preg->used = 0;
765 preg->re_nsub = 0;
766 preg->can_be_null = 0;
767 preg->regs_allocated = REGS_UNALLOCATED;
768
769 /* Initialize the dfa. */
005de2e8 770 dfa = preg->buffer;
eb4a14ed
LC
771 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
772 {
773 /* If zero allocated, but buffer is non-null, try to realloc
774 enough space. This loses if buffer's address is bogus, but
775 that is the user's responsibility. If ->buffer is NULL this
776 is a simple allocation. */
777 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
778 if (dfa == NULL)
779 return REG_ESPACE;
780 preg->allocated = sizeof (re_dfa_t);
005de2e8 781 preg->buffer = dfa;
eb4a14ed
LC
782 }
783 preg->used = sizeof (re_dfa_t);
784
785 err = init_dfa (dfa, length);
786 if (BE (err != REG_NOERROR, 0))
787 {
788 free_dfa_content (dfa);
789 preg->buffer = NULL;
790 preg->allocated = 0;
791 return err;
792 }
793#ifdef DEBUG
794 /* Note: length+1 will not overflow since it is checked in init_dfa. */
795 dfa->re_str = re_malloc (char, length + 1);
796 strncpy (dfa->re_str, pattern, length + 1);
797#endif
798
799 __libc_lock_init (dfa->lock);
800
801 err = re_string_construct (&regexp, pattern, length, preg->translate,
802 (syntax & RE_ICASE) != 0, dfa);
803 if (BE (err != REG_NOERROR, 0))
804 {
805 re_compile_internal_free_return:
806 free_workarea_compile (preg);
807 re_string_destruct (&regexp);
808 free_dfa_content (dfa);
809 preg->buffer = NULL;
810 preg->allocated = 0;
811 return err;
812 }
813
814 /* Parse the regular expression, and build a structure tree. */
815 preg->re_nsub = 0;
816 dfa->str_tree = parse (&regexp, preg, syntax, &err);
817 if (BE (dfa->str_tree == NULL, 0))
818 goto re_compile_internal_free_return;
819
820 /* Analyze the tree and create the nfa. */
821 err = analyze (preg);
822 if (BE (err != REG_NOERROR, 0))
823 goto re_compile_internal_free_return;
824
825#ifdef RE_ENABLE_I18N
826 /* If possible, do searching in single byte encoding to speed things up. */
827 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
828 optimize_utf8 (dfa);
829#endif
830
831 /* Then create the initial state of the dfa. */
832 err = create_initial_state (dfa);
833
834 /* Release work areas. */
835 free_workarea_compile (preg);
836 re_string_destruct (&regexp);
837
838 if (BE (err != REG_NOERROR, 0))
839 {
840 free_dfa_content (dfa);
841 preg->buffer = NULL;
842 preg->allocated = 0;
843 }
844
845 return err;
846}
847
848/* Initialize DFA. We use the length of the regular expression PAT_LEN
849 as the initial length of some arrays. */
850
851static reg_errcode_t
852init_dfa (re_dfa_t *dfa, size_t pat_len)
853{
854 __re_size_t table_size;
855#ifndef _LIBC
005de2e8 856 const char *codeset_name;
eb4a14ed
LC
857#endif
858#ifdef RE_ENABLE_I18N
859 size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
860#else
861 size_t max_i18n_object_size = 0;
862#endif
863 size_t max_object_size =
864 MAX (sizeof (struct re_state_table_entry),
865 MAX (sizeof (re_token_t),
866 MAX (sizeof (re_node_set),
867 MAX (sizeof (regmatch_t),
868 max_i18n_object_size))));
869
870 memset (dfa, '\0', sizeof (re_dfa_t));
871
872 /* Force allocation of str_tree_storage the first time. */
873 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
874
875 /* Avoid overflows. The extra "/ 2" is for the table_size doubling
876 calculation below, and for similar doubling calculations
877 elsewhere. And it's <= rather than <, because some of the
878 doubling calculations add 1 afterwards. */
005de2e8 879 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
eb4a14ed
LC
880 return REG_ESPACE;
881
882 dfa->nodes_alloc = pat_len + 1;
883 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
884
885 /* table_size = 2 ^ ceil(log pat_len) */
886 for (table_size = 1; ; table_size <<= 1)
887 if (table_size > pat_len)
888 break;
889
890 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
891 dfa->state_hash_mask = table_size - 1;
892
893 dfa->mb_cur_max = MB_CUR_MAX;
894#ifdef _LIBC
895 if (dfa->mb_cur_max == 6
896 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
897 dfa->is_utf8 = 1;
898 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
899 != 0);
900#else
901 codeset_name = nl_langinfo (CODESET);
005de2e8
LC
902 if ((codeset_name[0] == 'U' || codeset_name[0] == 'u')
903 && (codeset_name[1] == 'T' || codeset_name[1] == 't')
904 && (codeset_name[2] == 'F' || codeset_name[2] == 'f')
905 && strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
eb4a14ed
LC
906 dfa->is_utf8 = 1;
907
908 /* We check exhaustively in the loop below if this charset is a
909 superset of ASCII. */
910 dfa->map_notascii = 0;
911#endif
912
913#ifdef RE_ENABLE_I18N
914 if (dfa->mb_cur_max > 1)
915 {
916 if (dfa->is_utf8)
917 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
918 else
919 {
920 int i, j, ch;
921
922 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
923 if (BE (dfa->sb_char == NULL, 0))
924 return REG_ESPACE;
925
926 /* Set the bits corresponding to single byte chars. */
927 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
928 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
929 {
930 wint_t wch = __btowc (ch);
931 if (wch != WEOF)
932 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
933# ifndef _LIBC
934 if (isascii (ch) && wch != ch)
935 dfa->map_notascii = 1;
936# endif
937 }
938 }
939 }
940#endif
941
942 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
943 return REG_ESPACE;
944 return REG_NOERROR;
945}
946
947/* Initialize WORD_CHAR table, which indicate which character is
948 "word". In this case "word" means that it is the word construction
949 character used by some operators like "\<", "\>", etc. */
950
951static void
952internal_function
953init_word_char (re_dfa_t *dfa)
954{
eb4a14ed 955 dfa->word_ops_used = 1;
005de2e8
LC
956 int i = 0;
957 int j;
958 int ch = 0;
959 if (BE (dfa->map_notascii == 0, 1))
960 {
961 bitset_word_t bits0 = 0x00000000;
962 bitset_word_t bits1 = 0x03ff0000;
963 bitset_word_t bits2 = 0x87fffffe;
964 bitset_word_t bits3 = 0x07fffffe;
965 if (BITSET_WORD_BITS == 64)
966 {
967 dfa->word_char[0] = bits1 << 31 << 1 | bits0;
968 dfa->word_char[1] = bits3 << 31 << 1 | bits2;
969 i = 2;
970 }
971 else if (BITSET_WORD_BITS == 32)
972 {
973 dfa->word_char[0] = bits0;
974 dfa->word_char[1] = bits1;
975 dfa->word_char[2] = bits2;
976 dfa->word_char[3] = bits3;
977 i = 4;
978 }
979 else
980 goto general_case;
981 ch = 128;
982
983 if (BE (dfa->is_utf8, 1))
984 {
985 memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
986 return;
987 }
988 }
989
990 general_case:
991 for (; i < BITSET_WORDS; ++i)
eb4a14ed
LC
992 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
993 if (isalnum (ch) || ch == '_')
994 dfa->word_char[i] |= (bitset_word_t) 1 << j;
995}
996
997/* Free the work area which are only used while compiling. */
998
999static void
1000free_workarea_compile (regex_t *preg)
1001{
005de2e8 1002 re_dfa_t *dfa = preg->buffer;
eb4a14ed
LC
1003 bin_tree_storage_t *storage, *next;
1004 for (storage = dfa->str_tree_storage; storage; storage = next)
1005 {
1006 next = storage->next;
1007 re_free (storage);
1008 }
1009 dfa->str_tree_storage = NULL;
1010 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
1011 dfa->str_tree = NULL;
1012 re_free (dfa->org_indices);
1013 dfa->org_indices = NULL;
1014}
1015
1016/* Create initial states for all contexts. */
1017
1018static reg_errcode_t
1019create_initial_state (re_dfa_t *dfa)
1020{
1021 Idx first, i;
1022 reg_errcode_t err;
1023 re_node_set init_nodes;
1024
1025 /* Initial states have the epsilon closure of the node which is
1026 the first node of the regular expression. */
1027 first = dfa->str_tree->first->node_idx;
1028 dfa->init_node = first;
1029 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1030 if (BE (err != REG_NOERROR, 0))
1031 return err;
1032
1033 /* The back-references which are in initial states can epsilon transit,
1034 since in this case all of the subexpressions can be null.
1035 Then we add epsilon closures of the nodes which are the next nodes of
1036 the back-references. */
1037 if (dfa->nbackref > 0)
1038 for (i = 0; i < init_nodes.nelem; ++i)
1039 {
1040 Idx node_idx = init_nodes.elems[i];
1041 re_token_type_t type = dfa->nodes[node_idx].type;
1042
1043 Idx clexp_idx;
1044 if (type != OP_BACK_REF)
1045 continue;
1046 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1047 {
1048 re_token_t *clexp_node;
1049 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1050 if (clexp_node->type == OP_CLOSE_SUBEXP
1051 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1052 break;
1053 }
1054 if (clexp_idx == init_nodes.nelem)
1055 continue;
1056
1057 if (type == OP_BACK_REF)
1058 {
1059 Idx dest_idx = dfa->edests[node_idx].elems[0];
1060 if (!re_node_set_contains (&init_nodes, dest_idx))
1061 {
1062 reg_errcode_t merge_err
1063 = re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1064 if (merge_err != REG_NOERROR)
1065 return merge_err;
1066 i = 0;
1067 }
1068 }
1069 }
1070
1071 /* It must be the first time to invoke acquire_state. */
1072 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1073 /* We don't check ERR here, since the initial state must not be NULL. */
1074 if (BE (dfa->init_state == NULL, 0))
1075 return err;
1076 if (dfa->init_state->has_constraint)
1077 {
1078 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1079 CONTEXT_WORD);
1080 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1081 CONTEXT_NEWLINE);
1082 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1083 &init_nodes,
1084 CONTEXT_NEWLINE
1085 | CONTEXT_BEGBUF);
1086 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1087 || dfa->init_state_begbuf == NULL, 0))
1088 return err;
1089 }
1090 else
1091 dfa->init_state_word = dfa->init_state_nl
1092 = dfa->init_state_begbuf = dfa->init_state;
1093
1094 re_node_set_free (&init_nodes);
1095 return REG_NOERROR;
1096}
1097\f
1098#ifdef RE_ENABLE_I18N
1099/* If it is possible to do searching in single byte encoding instead of UTF-8
1100 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1101 DFA nodes where needed. */
1102
1103static void
1104optimize_utf8 (re_dfa_t *dfa)
1105{
1106 Idx node;
1107 int i;
1108 bool mb_chars = false;
1109 bool has_period = false;
1110
1111 for (node = 0; node < dfa->nodes_len; ++node)
1112 switch (dfa->nodes[node].type)
1113 {
1114 case CHARACTER:
1115 if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1116 mb_chars = true;
1117 break;
1118 case ANCHOR:
1119 switch (dfa->nodes[node].opr.ctx_type)
1120 {
1121 case LINE_FIRST:
1122 case LINE_LAST:
1123 case BUF_FIRST:
1124 case BUF_LAST:
1125 break;
1126 default:
1127 /* Word anchors etc. cannot be handled. It's okay to test
1128 opr.ctx_type since constraints (for all DFA nodes) are
1129 created by ORing one or more opr.ctx_type values. */
1130 return;
1131 }
1132 break;
1133 case OP_PERIOD:
1134 has_period = true;
1135 break;
1136 case OP_BACK_REF:
1137 case OP_ALT:
1138 case END_OF_RE:
1139 case OP_DUP_ASTERISK:
1140 case OP_OPEN_SUBEXP:
1141 case OP_CLOSE_SUBEXP:
1142 break;
1143 case COMPLEX_BRACKET:
1144 return;
1145 case SIMPLE_BRACKET:
1146 /* Just double check. */
1147 {
1148 int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1149 ? 0
1150 : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1151 for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1152 {
1153 if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1154 return;
1155 rshift = 0;
1156 }
1157 }
1158 break;
1159 default:
1160 abort ();
1161 }
1162
1163 if (mb_chars || has_period)
1164 for (node = 0; node < dfa->nodes_len; ++node)
1165 {
1166 if (dfa->nodes[node].type == CHARACTER
1167 && dfa->nodes[node].opr.c >= ASCII_CHARS)
1168 dfa->nodes[node].mb_partial = 0;
1169 else if (dfa->nodes[node].type == OP_PERIOD)
1170 dfa->nodes[node].type = OP_UTF8_PERIOD;
1171 }
1172
1173 /* The search can be in single byte locale. */
1174 dfa->mb_cur_max = 1;
1175 dfa->is_utf8 = 0;
1176 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1177}
1178#endif
1179\f
1180/* Analyze the structure tree, and calculate "first", "next", "edest",
1181 "eclosure", and "inveclosure". */
1182
1183static reg_errcode_t
1184analyze (regex_t *preg)
1185{
005de2e8 1186 re_dfa_t *dfa = preg->buffer;
eb4a14ed
LC
1187 reg_errcode_t ret;
1188
1189 /* Allocate arrays. */
1190 dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1191 dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1192 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1193 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1194 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1195 || dfa->eclosures == NULL, 0))
1196 return REG_ESPACE;
1197
1198 dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1199 if (dfa->subexp_map != NULL)
1200 {
1201 Idx i;
1202 for (i = 0; i < preg->re_nsub; i++)
1203 dfa->subexp_map[i] = i;
1204 preorder (dfa->str_tree, optimize_subexps, dfa);
1205 for (i = 0; i < preg->re_nsub; i++)
1206 if (dfa->subexp_map[i] != i)
1207 break;
1208 if (i == preg->re_nsub)
1209 {
1210 free (dfa->subexp_map);
1211 dfa->subexp_map = NULL;
1212 }
1213 }
1214
1215 ret = postorder (dfa->str_tree, lower_subexps, preg);
1216 if (BE (ret != REG_NOERROR, 0))
1217 return ret;
1218 ret = postorder (dfa->str_tree, calc_first, dfa);
1219 if (BE (ret != REG_NOERROR, 0))
1220 return ret;
1221 preorder (dfa->str_tree, calc_next, dfa);
1222 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1223 if (BE (ret != REG_NOERROR, 0))
1224 return ret;
1225 ret = calc_eclosure (dfa);
1226 if (BE (ret != REG_NOERROR, 0))
1227 return ret;
1228
1229 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1230 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1231 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1232 || dfa->nbackref)
1233 {
1234 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1235 if (BE (dfa->inveclosures == NULL, 0))
1236 return REG_ESPACE;
1237 ret = calc_inveclosure (dfa);
1238 }
1239
1240 return ret;
1241}
1242
1243/* Our parse trees are very unbalanced, so we cannot use a stack to
1244 implement parse tree visits. Instead, we use parent pointers and
1245 some hairy code in these two functions. */
1246static reg_errcode_t
1247postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1248 void *extra)
1249{
1250 bin_tree_t *node, *prev;
1251
1252 for (node = root; ; )
1253 {
1254 /* Descend down the tree, preferably to the left (or to the right
1255 if that's the only child). */
1256 while (node->left || node->right)
1257 if (node->left)
1258 node = node->left;
1259 else
1260 node = node->right;
1261
1262 do
1263 {
1264 reg_errcode_t err = fn (extra, node);
1265 if (BE (err != REG_NOERROR, 0))
1266 return err;
1267 if (node->parent == NULL)
1268 return REG_NOERROR;
1269 prev = node;
1270 node = node->parent;
1271 }
1272 /* Go up while we have a node that is reached from the right. */
1273 while (node->right == prev || node->right == NULL);
1274 node = node->right;
1275 }
1276}
1277
1278static reg_errcode_t
1279preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1280 void *extra)
1281{
1282 bin_tree_t *node;
1283
1284 for (node = root; ; )
1285 {
1286 reg_errcode_t err = fn (extra, node);
1287 if (BE (err != REG_NOERROR, 0))
1288 return err;
1289
1290 /* Go to the left node, or up and to the right. */
1291 if (node->left)
1292 node = node->left;
1293 else
1294 {
1295 bin_tree_t *prev = NULL;
1296 while (node->right == prev || node->right == NULL)
1297 {
1298 prev = node;
1299 node = node->parent;
1300 if (!node)
1301 return REG_NOERROR;
1302 }
1303 node = node->right;
1304 }
1305 }
1306}
1307
1308/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1309 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1310 backreferences as well. Requires a preorder visit. */
1311static reg_errcode_t
1312optimize_subexps (void *extra, bin_tree_t *node)
1313{
1314 re_dfa_t *dfa = (re_dfa_t *) extra;
1315
1316 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1317 {
1318 int idx = node->token.opr.idx;
1319 node->token.opr.idx = dfa->subexp_map[idx];
1320 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1321 }
1322
1323 else if (node->token.type == SUBEXP
1324 && node->left && node->left->token.type == SUBEXP)
1325 {
1326 Idx other_idx = node->left->token.opr.idx;
1327
1328 node->left = node->left->left;
1329 if (node->left)
1330 node->left->parent = node;
1331
1332 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1333 if (other_idx < BITSET_WORD_BITS)
1334 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1335 }
1336
1337 return REG_NOERROR;
1338}
1339
1340/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1341 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1342static reg_errcode_t
1343lower_subexps (void *extra, bin_tree_t *node)
1344{
1345 regex_t *preg = (regex_t *) extra;
1346 reg_errcode_t err = REG_NOERROR;
1347
1348 if (node->left && node->left->token.type == SUBEXP)
1349 {
1350 node->left = lower_subexp (&err, preg, node->left);
1351 if (node->left)
1352 node->left->parent = node;
1353 }
1354 if (node->right && node->right->token.type == SUBEXP)
1355 {
1356 node->right = lower_subexp (&err, preg, node->right);
1357 if (node->right)
1358 node->right->parent = node;
1359 }
1360
1361 return err;
1362}
1363
1364static bin_tree_t *
1365lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1366{
005de2e8 1367 re_dfa_t *dfa = preg->buffer;
eb4a14ed
LC
1368 bin_tree_t *body = node->left;
1369 bin_tree_t *op, *cls, *tree1, *tree;
1370
1371 if (preg->no_sub
1372 /* We do not optimize empty subexpressions, because otherwise we may
1373 have bad CONCAT nodes with NULL children. This is obviously not
1374 very common, so we do not lose much. An example that triggers
1375 this case is the sed "script" /\(\)/x. */
1376 && node->left != NULL
1377 && (node->token.opr.idx >= BITSET_WORD_BITS
1378 || !(dfa->used_bkref_map
1379 & ((bitset_word_t) 1 << node->token.opr.idx))))
1380 return node->left;
1381
1382 /* Convert the SUBEXP node to the concatenation of an
1383 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1384 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1385 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1386 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1387 tree = create_tree (dfa, op, tree1, CONCAT);
1388 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1389 {
1390 *err = REG_ESPACE;
1391 return NULL;
1392 }
1393
1394 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1395 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1396 return tree;
1397}
1398
1399/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1400 nodes. Requires a postorder visit. */
1401static reg_errcode_t
1402calc_first (void *extra, bin_tree_t *node)
1403{
1404 re_dfa_t *dfa = (re_dfa_t *) extra;
1405 if (node->token.type == CONCAT)
1406 {
1407 node->first = node->left->first;
1408 node->node_idx = node->left->node_idx;
1409 }
1410 else
1411 {
1412 node->first = node;
1413 node->node_idx = re_dfa_add_node (dfa, node->token);
1414 if (BE (node->node_idx == REG_MISSING, 0))
1415 return REG_ESPACE;
1416 if (node->token.type == ANCHOR)
1417 dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1418 }
1419 return REG_NOERROR;
1420}
1421
1422/* Pass 2: compute NEXT on the tree. Preorder visit. */
1423static reg_errcode_t
1424calc_next (void *extra, bin_tree_t *node)
1425{
1426 switch (node->token.type)
1427 {
1428 case OP_DUP_ASTERISK:
1429 node->left->next = node;
1430 break;
1431 case CONCAT:
1432 node->left->next = node->right->first;
1433 node->right->next = node->next;
1434 break;
1435 default:
1436 if (node->left)
1437 node->left->next = node->next;
1438 if (node->right)
1439 node->right->next = node->next;
1440 break;
1441 }
1442 return REG_NOERROR;
1443}
1444
1445/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1446static reg_errcode_t
1447link_nfa_nodes (void *extra, bin_tree_t *node)
1448{
1449 re_dfa_t *dfa = (re_dfa_t *) extra;
1450 Idx idx = node->node_idx;
1451 reg_errcode_t err = REG_NOERROR;
1452
1453 switch (node->token.type)
1454 {
1455 case CONCAT:
1456 break;
1457
1458 case END_OF_RE:
1459 assert (node->next == NULL);
1460 break;
1461
1462 case OP_DUP_ASTERISK:
1463 case OP_ALT:
1464 {
1465 Idx left, right;
1466 dfa->has_plural_match = 1;
1467 if (node->left != NULL)
1468 left = node->left->first->node_idx;
1469 else
1470 left = node->next->node_idx;
1471 if (node->right != NULL)
1472 right = node->right->first->node_idx;
1473 else
1474 right = node->next->node_idx;
1475 assert (REG_VALID_INDEX (left));
1476 assert (REG_VALID_INDEX (right));
1477 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1478 }
1479 break;
1480
1481 case ANCHOR:
1482 case OP_OPEN_SUBEXP:
1483 case OP_CLOSE_SUBEXP:
1484 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1485 break;
1486
1487 case OP_BACK_REF:
1488 dfa->nexts[idx] = node->next->node_idx;
1489 if (node->token.type == OP_BACK_REF)
1490 err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1491 break;
1492
1493 default:
1494 assert (!IS_EPSILON_NODE (node->token.type));
1495 dfa->nexts[idx] = node->next->node_idx;
1496 break;
1497 }
1498
1499 return err;
1500}
1501
1502/* Duplicate the epsilon closure of the node ROOT_NODE.
1503 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1504 to their own constraint. */
1505
1506static reg_errcode_t
1507internal_function
1508duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1509 Idx root_node, unsigned int init_constraint)
1510{
1511 Idx org_node, clone_node;
1512 bool ok;
1513 unsigned int constraint = init_constraint;
1514 for (org_node = top_org_node, clone_node = top_clone_node;;)
1515 {
1516 Idx org_dest, clone_dest;
1517 if (dfa->nodes[org_node].type == OP_BACK_REF)
1518 {
1519 /* If the back reference epsilon-transit, its destination must
1520 also have the constraint. Then duplicate the epsilon closure
1521 of the destination of the back reference, and store it in
1522 edests of the back reference. */
1523 org_dest = dfa->nexts[org_node];
1524 re_node_set_empty (dfa->edests + clone_node);
1525 clone_dest = duplicate_node (dfa, org_dest, constraint);
1526 if (BE (clone_dest == REG_MISSING, 0))
1527 return REG_ESPACE;
1528 dfa->nexts[clone_node] = dfa->nexts[org_node];
1529 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1530 if (BE (! ok, 0))
1531 return REG_ESPACE;
1532 }
1533 else if (dfa->edests[org_node].nelem == 0)
1534 {
1535 /* In case of the node can't epsilon-transit, don't duplicate the
1536 destination and store the original destination as the
1537 destination of the node. */
1538 dfa->nexts[clone_node] = dfa->nexts[org_node];
1539 break;
1540 }
1541 else if (dfa->edests[org_node].nelem == 1)
1542 {
1543 /* In case of the node can epsilon-transit, and it has only one
1544 destination. */
1545 org_dest = dfa->edests[org_node].elems[0];
1546 re_node_set_empty (dfa->edests + clone_node);
1547 /* If the node is root_node itself, it means the epsilon closure
1548 has a loop. Then tie it to the destination of the root_node. */
1549 if (org_node == root_node && clone_node != org_node)
1550 {
1551 ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1552 if (BE (! ok, 0))
1553 return REG_ESPACE;
1554 break;
1555 }
1556 /* In case the node has another constraint, append it. */
1557 constraint |= dfa->nodes[org_node].constraint;
1558 clone_dest = duplicate_node (dfa, org_dest, constraint);
1559 if (BE (clone_dest == REG_MISSING, 0))
1560 return REG_ESPACE;
1561 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1562 if (BE (! ok, 0))
1563 return REG_ESPACE;
1564 }
1565 else /* dfa->edests[org_node].nelem == 2 */
1566 {
1567 /* In case of the node can epsilon-transit, and it has two
1568 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1569 org_dest = dfa->edests[org_node].elems[0];
1570 re_node_set_empty (dfa->edests + clone_node);
1571 /* Search for a duplicated node which satisfies the constraint. */
1572 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1573 if (clone_dest == REG_MISSING)
1574 {
1575 /* There is no such duplicated node, create a new one. */
1576 reg_errcode_t err;
1577 clone_dest = duplicate_node (dfa, org_dest, constraint);
1578 if (BE (clone_dest == REG_MISSING, 0))
1579 return REG_ESPACE;
1580 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1581 if (BE (! ok, 0))
1582 return REG_ESPACE;
1583 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1584 root_node, constraint);
1585 if (BE (err != REG_NOERROR, 0))
1586 return err;
1587 }
1588 else
1589 {
1590 /* There is a duplicated node which satisfies the constraint,
1591 use it to avoid infinite loop. */
1592 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1593 if (BE (! ok, 0))
1594 return REG_ESPACE;
1595 }
1596
1597 org_dest = dfa->edests[org_node].elems[1];
1598 clone_dest = duplicate_node (dfa, org_dest, constraint);
1599 if (BE (clone_dest == REG_MISSING, 0))
1600 return REG_ESPACE;
1601 ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1602 if (BE (! ok, 0))
1603 return REG_ESPACE;
1604 }
1605 org_node = org_dest;
1606 clone_node = clone_dest;
1607 }
1608 return REG_NOERROR;
1609}
1610
1611/* Search for a node which is duplicated from the node ORG_NODE, and
1612 satisfies the constraint CONSTRAINT. */
1613
1614static Idx
1615search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1616 unsigned int constraint)
1617{
1618 Idx idx;
1619 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1620 {
1621 if (org_node == dfa->org_indices[idx]
1622 && constraint == dfa->nodes[idx].constraint)
1623 return idx; /* Found. */
1624 }
1625 return REG_MISSING; /* Not found. */
1626}
1627
1628/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1629 Return the index of the new node, or REG_MISSING if insufficient storage is
1630 available. */
1631
1632static Idx
1633duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1634{
1635 Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1636 if (BE (dup_idx != REG_MISSING, 1))
1637 {
1638 dfa->nodes[dup_idx].constraint = constraint;
1639 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1640 dfa->nodes[dup_idx].duplicated = 1;
1641
1642 /* Store the index of the original node. */
1643 dfa->org_indices[dup_idx] = org_idx;
1644 }
1645 return dup_idx;
1646}
1647
1648static reg_errcode_t
1649calc_inveclosure (re_dfa_t *dfa)
1650{
1651 Idx src, idx;
1652 bool ok;
1653 for (idx = 0; idx < dfa->nodes_len; ++idx)
1654 re_node_set_init_empty (dfa->inveclosures + idx);
1655
1656 for (src = 0; src < dfa->nodes_len; ++src)
1657 {
1658 Idx *elems = dfa->eclosures[src].elems;
1659 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1660 {
1661 ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1662 if (BE (! ok, 0))
1663 return REG_ESPACE;
1664 }
1665 }
1666
1667 return REG_NOERROR;
1668}
1669
1670/* Calculate "eclosure" for all the node in DFA. */
1671
1672static reg_errcode_t
1673calc_eclosure (re_dfa_t *dfa)
1674{
1675 Idx node_idx;
1676 bool incomplete;
1677#ifdef DEBUG
1678 assert (dfa->nodes_len > 0);
1679#endif
1680 incomplete = false;
1681 /* For each nodes, calculate epsilon closure. */
1682 for (node_idx = 0; ; ++node_idx)
1683 {
1684 reg_errcode_t err;
1685 re_node_set eclosure_elem;
1686 if (node_idx == dfa->nodes_len)
1687 {
1688 if (!incomplete)
1689 break;
1690 incomplete = false;
1691 node_idx = 0;
1692 }
1693
1694#ifdef DEBUG
1695 assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1696#endif
1697
1698 /* If we have already calculated, skip it. */
1699 if (dfa->eclosures[node_idx].nelem != 0)
1700 continue;
1701 /* Calculate epsilon closure of 'node_idx'. */
1702 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1703 if (BE (err != REG_NOERROR, 0))
1704 return err;
1705
1706 if (dfa->eclosures[node_idx].nelem == 0)
1707 {
1708 incomplete = true;
1709 re_node_set_free (&eclosure_elem);
1710 }
1711 }
1712 return REG_NOERROR;
1713}
1714
1715/* Calculate epsilon closure of NODE. */
1716
1717static reg_errcode_t
1718calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1719{
1720 reg_errcode_t err;
1721 Idx i;
1722 re_node_set eclosure;
1723 bool ok;
1724 bool incomplete = false;
1725 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1726 if (BE (err != REG_NOERROR, 0))
1727 return err;
1728
1729 /* This indicates that we are calculating this node now.
1730 We reference this value to avoid infinite loop. */
1731 dfa->eclosures[node].nelem = REG_MISSING;
1732
1733 /* If the current node has constraints, duplicate all nodes
1734 since they must inherit the constraints. */
1735 if (dfa->nodes[node].constraint
1736 && dfa->edests[node].nelem
1737 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1738 {
1739 err = duplicate_node_closure (dfa, node, node, node,
1740 dfa->nodes[node].constraint);
1741 if (BE (err != REG_NOERROR, 0))
1742 return err;
1743 }
1744
1745 /* Expand each epsilon destination nodes. */
1746 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1747 for (i = 0; i < dfa->edests[node].nelem; ++i)
1748 {
1749 re_node_set eclosure_elem;
1750 Idx edest = dfa->edests[node].elems[i];
1751 /* If calculating the epsilon closure of 'edest' is in progress,
1752 return intermediate result. */
1753 if (dfa->eclosures[edest].nelem == REG_MISSING)
1754 {
1755 incomplete = true;
1756 continue;
1757 }
1758 /* If we haven't calculated the epsilon closure of 'edest' yet,
1759 calculate now. Otherwise use calculated epsilon closure. */
1760 if (dfa->eclosures[edest].nelem == 0)
1761 {
1762 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1763 if (BE (err != REG_NOERROR, 0))
1764 return err;
1765 }
1766 else
1767 eclosure_elem = dfa->eclosures[edest];
1768 /* Merge the epsilon closure of 'edest'. */
1769 err = re_node_set_merge (&eclosure, &eclosure_elem);
1770 if (BE (err != REG_NOERROR, 0))
1771 return err;
1772 /* If the epsilon closure of 'edest' is incomplete,
1773 the epsilon closure of this node is also incomplete. */
1774 if (dfa->eclosures[edest].nelem == 0)
1775 {
1776 incomplete = true;
1777 re_node_set_free (&eclosure_elem);
1778 }
1779 }
1780
1781 /* An epsilon closure includes itself. */
1782 ok = re_node_set_insert (&eclosure, node);
1783 if (BE (! ok, 0))
1784 return REG_ESPACE;
1785 if (incomplete && !root)
1786 dfa->eclosures[node].nelem = 0;
1787 else
1788 dfa->eclosures[node] = eclosure;
1789 *new_set = eclosure;
1790 return REG_NOERROR;
1791}
1792\f
1793/* Functions for token which are used in the parser. */
1794
1795/* Fetch a token from INPUT.
1796 We must not use this function inside bracket expressions. */
1797
1798static void
1799internal_function
1800fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1801{
1802 re_string_skip_bytes (input, peek_token (result, input, syntax));
1803}
1804
1805/* Peek a token from INPUT, and return the length of the token.
1806 We must not use this function inside bracket expressions. */
1807
1808static int
1809internal_function
1810peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1811{
1812 unsigned char c;
1813
1814 if (re_string_eoi (input))
1815 {
1816 token->type = END_OF_RE;
1817 return 0;
1818 }
1819
1820 c = re_string_peek_byte (input, 0);
1821 token->opr.c = c;
1822
1823 token->word_char = 0;
1824#ifdef RE_ENABLE_I18N
1825 token->mb_partial = 0;
1826 if (input->mb_cur_max > 1 &&
1827 !re_string_first_byte (input, re_string_cur_idx (input)))
1828 {
1829 token->type = CHARACTER;
1830 token->mb_partial = 1;
1831 return 1;
1832 }
1833#endif
1834 if (c == '\\')
1835 {
1836 unsigned char c2;
1837 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1838 {
1839 token->type = BACK_SLASH;
1840 return 1;
1841 }
1842
1843 c2 = re_string_peek_byte_case (input, 1);
1844 token->opr.c = c2;
1845 token->type = CHARACTER;
1846#ifdef RE_ENABLE_I18N
1847 if (input->mb_cur_max > 1)
1848 {
1849 wint_t wc = re_string_wchar_at (input,
1850 re_string_cur_idx (input) + 1);
1851 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1852 }
1853 else
1854#endif
1855 token->word_char = IS_WORD_CHAR (c2) != 0;
1856
1857 switch (c2)
1858 {
1859 case '|':
1860 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1861 token->type = OP_ALT;
1862 break;
1863 case '1': case '2': case '3': case '4': case '5':
1864 case '6': case '7': case '8': case '9':
1865 if (!(syntax & RE_NO_BK_REFS))
1866 {
1867 token->type = OP_BACK_REF;
1868 token->opr.idx = c2 - '1';
1869 }
1870 break;
1871 case '<':
1872 if (!(syntax & RE_NO_GNU_OPS))
1873 {
1874 token->type = ANCHOR;
1875 token->opr.ctx_type = WORD_FIRST;
1876 }
1877 break;
1878 case '>':
1879 if (!(syntax & RE_NO_GNU_OPS))
1880 {
1881 token->type = ANCHOR;
1882 token->opr.ctx_type = WORD_LAST;
1883 }
1884 break;
1885 case 'b':
1886 if (!(syntax & RE_NO_GNU_OPS))
1887 {
1888 token->type = ANCHOR;
1889 token->opr.ctx_type = WORD_DELIM;
1890 }
1891 break;
1892 case 'B':
1893 if (!(syntax & RE_NO_GNU_OPS))
1894 {
1895 token->type = ANCHOR;
1896 token->opr.ctx_type = NOT_WORD_DELIM;
1897 }
1898 break;
1899 case 'w':
1900 if (!(syntax & RE_NO_GNU_OPS))
1901 token->type = OP_WORD;
1902 break;
1903 case 'W':
1904 if (!(syntax & RE_NO_GNU_OPS))
1905 token->type = OP_NOTWORD;
1906 break;
1907 case 's':
1908 if (!(syntax & RE_NO_GNU_OPS))
1909 token->type = OP_SPACE;
1910 break;
1911 case 'S':
1912 if (!(syntax & RE_NO_GNU_OPS))
1913 token->type = OP_NOTSPACE;
1914 break;
1915 case '`':
1916 if (!(syntax & RE_NO_GNU_OPS))
1917 {
1918 token->type = ANCHOR;
1919 token->opr.ctx_type = BUF_FIRST;
1920 }
1921 break;
1922 case '\'':
1923 if (!(syntax & RE_NO_GNU_OPS))
1924 {
1925 token->type = ANCHOR;
1926 token->opr.ctx_type = BUF_LAST;
1927 }
1928 break;
1929 case '(':
1930 if (!(syntax & RE_NO_BK_PARENS))
1931 token->type = OP_OPEN_SUBEXP;
1932 break;
1933 case ')':
1934 if (!(syntax & RE_NO_BK_PARENS))
1935 token->type = OP_CLOSE_SUBEXP;
1936 break;
1937 case '+':
1938 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1939 token->type = OP_DUP_PLUS;
1940 break;
1941 case '?':
1942 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1943 token->type = OP_DUP_QUESTION;
1944 break;
1945 case '{':
1946 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1947 token->type = OP_OPEN_DUP_NUM;
1948 break;
1949 case '}':
1950 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1951 token->type = OP_CLOSE_DUP_NUM;
1952 break;
1953 default:
1954 break;
1955 }
1956 return 2;
1957 }
1958
1959 token->type = CHARACTER;
1960#ifdef RE_ENABLE_I18N
1961 if (input->mb_cur_max > 1)
1962 {
1963 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1964 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1965 }
1966 else
1967#endif
1968 token->word_char = IS_WORD_CHAR (token->opr.c);
1969
1970 switch (c)
1971 {
1972 case '\n':
1973 if (syntax & RE_NEWLINE_ALT)
1974 token->type = OP_ALT;
1975 break;
1976 case '|':
1977 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1978 token->type = OP_ALT;
1979 break;
1980 case '*':
1981 token->type = OP_DUP_ASTERISK;
1982 break;
1983 case '+':
1984 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1985 token->type = OP_DUP_PLUS;
1986 break;
1987 case '?':
1988 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1989 token->type = OP_DUP_QUESTION;
1990 break;
1991 case '{':
1992 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1993 token->type = OP_OPEN_DUP_NUM;
1994 break;
1995 case '}':
1996 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1997 token->type = OP_CLOSE_DUP_NUM;
1998 break;
1999 case '(':
2000 if (syntax & RE_NO_BK_PARENS)
2001 token->type = OP_OPEN_SUBEXP;
2002 break;
2003 case ')':
2004 if (syntax & RE_NO_BK_PARENS)
2005 token->type = OP_CLOSE_SUBEXP;
2006 break;
2007 case '[':
2008 token->type = OP_OPEN_BRACKET;
2009 break;
2010 case '.':
2011 token->type = OP_PERIOD;
2012 break;
2013 case '^':
2014 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
2015 re_string_cur_idx (input) != 0)
2016 {
2017 char prev = re_string_peek_byte (input, -1);
2018 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
2019 break;
2020 }
2021 token->type = ANCHOR;
2022 token->opr.ctx_type = LINE_FIRST;
2023 break;
2024 case '$':
2025 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2026 re_string_cur_idx (input) + 1 != re_string_length (input))
2027 {
2028 re_token_t next;
2029 re_string_skip_bytes (input, 1);
2030 peek_token (&next, input, syntax);
2031 re_string_skip_bytes (input, -1);
2032 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2033 break;
2034 }
2035 token->type = ANCHOR;
2036 token->opr.ctx_type = LINE_LAST;
2037 break;
2038 default:
2039 break;
2040 }
2041 return 1;
2042}
2043
2044/* Peek a token from INPUT, and return the length of the token.
2045 We must not use this function out of bracket expressions. */
2046
2047static int
2048internal_function
2049peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2050{
2051 unsigned char c;
2052 if (re_string_eoi (input))
2053 {
2054 token->type = END_OF_RE;
2055 return 0;
2056 }
2057 c = re_string_peek_byte (input, 0);
2058 token->opr.c = c;
2059
2060#ifdef RE_ENABLE_I18N
2061 if (input->mb_cur_max > 1 &&
2062 !re_string_first_byte (input, re_string_cur_idx (input)))
2063 {
2064 token->type = CHARACTER;
2065 return 1;
2066 }
2067#endif /* RE_ENABLE_I18N */
2068
2069 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2070 && re_string_cur_idx (input) + 1 < re_string_length (input))
2071 {
2072 /* In this case, '\' escape a character. */
2073 unsigned char c2;
2074 re_string_skip_bytes (input, 1);
2075 c2 = re_string_peek_byte (input, 0);
2076 token->opr.c = c2;
2077 token->type = CHARACTER;
2078 return 1;
2079 }
2080 if (c == '[') /* '[' is a special char in a bracket exps. */
2081 {
2082 unsigned char c2;
2083 int token_len;
2084 if (re_string_cur_idx (input) + 1 < re_string_length (input))
2085 c2 = re_string_peek_byte (input, 1);
2086 else
2087 c2 = 0;
2088 token->opr.c = c2;
2089 token_len = 2;
2090 switch (c2)
2091 {
2092 case '.':
2093 token->type = OP_OPEN_COLL_ELEM;
2094 break;
2095 case '=':
2096 token->type = OP_OPEN_EQUIV_CLASS;
2097 break;
2098 case ':':
2099 if (syntax & RE_CHAR_CLASSES)
2100 {
2101 token->type = OP_OPEN_CHAR_CLASS;
2102 break;
2103 }
2104 /* else fall through. */
2105 default:
2106 token->type = CHARACTER;
2107 token->opr.c = c;
2108 token_len = 1;
2109 break;
2110 }
2111 return token_len;
2112 }
2113 switch (c)
2114 {
2115 case '-':
2116 token->type = OP_CHARSET_RANGE;
2117 break;
2118 case ']':
2119 token->type = OP_CLOSE_BRACKET;
2120 break;
2121 case '^':
2122 token->type = OP_NON_MATCH_LIST;
2123 break;
2124 default:
2125 token->type = CHARACTER;
2126 }
2127 return 1;
2128}
2129\f
2130/* Functions for parser. */
2131
2132/* Entry point of the parser.
2133 Parse the regular expression REGEXP and return the structure tree.
005de2e8 2134 If an error occurs, ERR is set by error code, and return NULL.
eb4a14ed
LC
2135 This function build the following tree, from regular expression <reg_exp>:
2136 CAT
2137 / \
2138 / \
2139 <reg_exp> EOR
2140
2141 CAT means concatenation.
2142 EOR means end of regular expression. */
2143
2144static bin_tree_t *
2145parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2146 reg_errcode_t *err)
2147{
005de2e8 2148 re_dfa_t *dfa = preg->buffer;
eb4a14ed
LC
2149 bin_tree_t *tree, *eor, *root;
2150 re_token_t current_token;
2151 dfa->syntax = syntax;
2152 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2153 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2154 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2155 return NULL;
2156 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2157 if (tree != NULL)
2158 root = create_tree (dfa, tree, eor, CONCAT);
2159 else
2160 root = eor;
2161 if (BE (eor == NULL || root == NULL, 0))
2162 {
2163 *err = REG_ESPACE;
2164 return NULL;
2165 }
2166 return root;
2167}
2168
2169/* This function build the following tree, from regular expression
2170 <branch1>|<branch2>:
2171 ALT
2172 / \
2173 / \
2174 <branch1> <branch2>
2175
2176 ALT means alternative, which represents the operator '|'. */
2177
2178static bin_tree_t *
2179parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2180 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2181{
005de2e8 2182 re_dfa_t *dfa = preg->buffer;
eb4a14ed
LC
2183 bin_tree_t *tree, *branch = NULL;
2184 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2185 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2186 return NULL;
2187
2188 while (token->type == OP_ALT)
2189 {
2190 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2191 if (token->type != OP_ALT && token->type != END_OF_RE
2192 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2193 {
2194 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2195 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2196 return NULL;
2197 }
2198 else
2199 branch = NULL;
2200 tree = create_tree (dfa, tree, branch, OP_ALT);
2201 if (BE (tree == NULL, 0))
2202 {
2203 *err = REG_ESPACE;
2204 return NULL;
2205 }
2206 }
2207 return tree;
2208}
2209
2210/* This function build the following tree, from regular expression
2211 <exp1><exp2>:
2212 CAT
2213 / \
2214 / \
2215 <exp1> <exp2>
2216
2217 CAT means concatenation. */
2218
2219static bin_tree_t *
2220parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2221 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2222{
2223 bin_tree_t *tree, *expr;
005de2e8 2224 re_dfa_t *dfa = preg->buffer;
eb4a14ed
LC
2225 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2226 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2227 return NULL;
2228
2229 while (token->type != OP_ALT && token->type != END_OF_RE
2230 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2231 {
2232 expr = parse_expression (regexp, preg, token, syntax, nest, err);
2233 if (BE (*err != REG_NOERROR && expr == NULL, 0))
2234 {
005de2e8
LC
2235 if (tree != NULL)
2236 postorder (tree, free_tree, NULL);
eb4a14ed
LC
2237 return NULL;
2238 }
2239 if (tree != NULL && expr != NULL)
2240 {
005de2e8
LC
2241 bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
2242 if (newtree == NULL)
eb4a14ed 2243 {
005de2e8
LC
2244 postorder (expr, free_tree, NULL);
2245 postorder (tree, free_tree, NULL);
eb4a14ed
LC
2246 *err = REG_ESPACE;
2247 return NULL;
2248 }
005de2e8 2249 tree = newtree;
eb4a14ed
LC
2250 }
2251 else if (tree == NULL)
2252 tree = expr;
2253 /* Otherwise expr == NULL, we don't need to create new tree. */
2254 }
2255 return tree;
2256}
2257
2258/* This function build the following tree, from regular expression a*:
2259 *
2260 |
2261 a
2262*/
2263
2264static bin_tree_t *
2265parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2266 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2267{
005de2e8 2268 re_dfa_t *dfa = preg->buffer;
eb4a14ed
LC
2269 bin_tree_t *tree;
2270 switch (token->type)
2271 {
2272 case CHARACTER:
2273 tree = create_token_tree (dfa, NULL, NULL, token);
2274 if (BE (tree == NULL, 0))
2275 {
2276 *err = REG_ESPACE;
2277 return NULL;
2278 }
2279#ifdef RE_ENABLE_I18N
2280 if (dfa->mb_cur_max > 1)
2281 {
2282 while (!re_string_eoi (regexp)
2283 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2284 {
2285 bin_tree_t *mbc_remain;
2286 fetch_token (token, regexp, syntax);
2287 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2288 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2289 if (BE (mbc_remain == NULL || tree == NULL, 0))
2290 {
2291 *err = REG_ESPACE;
2292 return NULL;
2293 }
2294 }
2295 }
2296#endif
2297 break;
2298 case OP_OPEN_SUBEXP:
2299 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2300 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2301 return NULL;
2302 break;
2303 case OP_OPEN_BRACKET:
2304 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2305 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2306 return NULL;
2307 break;
2308 case OP_BACK_REF:
2309 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2310 {
2311 *err = REG_ESUBREG;
2312 return NULL;
2313 }
2314 dfa->used_bkref_map |= 1 << token->opr.idx;
2315 tree = create_token_tree (dfa, NULL, NULL, token);
2316 if (BE (tree == NULL, 0))
2317 {
2318 *err = REG_ESPACE;
2319 return NULL;
2320 }
2321 ++dfa->nbackref;
2322 dfa->has_mb_node = 1;
2323 break;
2324 case OP_OPEN_DUP_NUM:
2325 if (syntax & RE_CONTEXT_INVALID_DUP)
2326 {
2327 *err = REG_BADRPT;
2328 return NULL;
2329 }
2330 /* FALLTHROUGH */
2331 case OP_DUP_ASTERISK:
2332 case OP_DUP_PLUS:
2333 case OP_DUP_QUESTION:
2334 if (syntax & RE_CONTEXT_INVALID_OPS)
2335 {
2336 *err = REG_BADRPT;
2337 return NULL;
2338 }
2339 else if (syntax & RE_CONTEXT_INDEP_OPS)
2340 {
2341 fetch_token (token, regexp, syntax);
2342 return parse_expression (regexp, preg, token, syntax, nest, err);
2343 }
2344 /* else fall through */
2345 case OP_CLOSE_SUBEXP:
2346 if ((token->type == OP_CLOSE_SUBEXP) &&
2347 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2348 {
2349 *err = REG_ERPAREN;
2350 return NULL;
2351 }
2352 /* else fall through */
2353 case OP_CLOSE_DUP_NUM:
2354 /* We treat it as a normal character. */
2355
2356 /* Then we can these characters as normal characters. */
2357 token->type = CHARACTER;
2358 /* mb_partial and word_char bits should be initialized already
2359 by peek_token. */
2360 tree = create_token_tree (dfa, NULL, NULL, token);
2361 if (BE (tree == NULL, 0))
2362 {
2363 *err = REG_ESPACE;
2364 return NULL;
2365 }
2366 break;
2367 case ANCHOR:
2368 if ((token->opr.ctx_type
2369 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2370 && dfa->word_ops_used == 0)
2371 init_word_char (dfa);
2372 if (token->opr.ctx_type == WORD_DELIM
2373 || token->opr.ctx_type == NOT_WORD_DELIM)
2374 {
2375 bin_tree_t *tree_first, *tree_last;
2376 if (token->opr.ctx_type == WORD_DELIM)
2377 {
2378 token->opr.ctx_type = WORD_FIRST;
2379 tree_first = create_token_tree (dfa, NULL, NULL, token);
2380 token->opr.ctx_type = WORD_LAST;
2381 }
2382 else
2383 {
2384 token->opr.ctx_type = INSIDE_WORD;
2385 tree_first = create_token_tree (dfa, NULL, NULL, token);
2386 token->opr.ctx_type = INSIDE_NOTWORD;
2387 }
2388 tree_last = create_token_tree (dfa, NULL, NULL, token);
2389 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2390 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2391 {
2392 *err = REG_ESPACE;
2393 return NULL;
2394 }
2395 }
2396 else
2397 {
2398 tree = create_token_tree (dfa, NULL, NULL, token);
2399 if (BE (tree == NULL, 0))
2400 {
2401 *err = REG_ESPACE;
2402 return NULL;
2403 }
2404 }
2405 /* We must return here, since ANCHORs can't be followed
2406 by repetition operators.
2407 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2408 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2409 fetch_token (token, regexp, syntax);
2410 return tree;
2411 case OP_PERIOD:
2412 tree = create_token_tree (dfa, NULL, NULL, token);
2413 if (BE (tree == NULL, 0))
2414 {
2415 *err = REG_ESPACE;
2416 return NULL;
2417 }
2418 if (dfa->mb_cur_max > 1)
2419 dfa->has_mb_node = 1;
2420 break;
2421 case OP_WORD:
2422 case OP_NOTWORD:
2423 tree = build_charclass_op (dfa, regexp->trans,
2424 (const unsigned char *) "alnum",
2425 (const unsigned char *) "_",
2426 token->type == OP_NOTWORD, err);
2427 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2428 return NULL;
2429 break;
2430 case OP_SPACE:
2431 case OP_NOTSPACE:
2432 tree = build_charclass_op (dfa, regexp->trans,
2433 (const unsigned char *) "space",
2434 (const unsigned char *) "",
2435 token->type == OP_NOTSPACE, err);
2436 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2437 return NULL;
2438 break;
2439 case OP_ALT:
2440 case END_OF_RE:
2441 return NULL;
2442 case BACK_SLASH:
2443 *err = REG_EESCAPE;
2444 return NULL;
2445 default:
2446 /* Must not happen? */
2447#ifdef DEBUG
2448 assert (0);
2449#endif
2450 return NULL;
2451 }
2452 fetch_token (token, regexp, syntax);
2453
2454 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2455 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2456 {
2457 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2458 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2459 return NULL;
2460 /* In BRE consecutive duplications are not allowed. */
2461 if ((syntax & RE_CONTEXT_INVALID_DUP)
2462 && (token->type == OP_DUP_ASTERISK
2463 || token->type == OP_OPEN_DUP_NUM))
2464 {
2465 *err = REG_BADRPT;
2466 return NULL;
2467 }
2468 }
2469
2470 return tree;
2471}
2472
2473/* This function build the following tree, from regular expression
2474 (<reg_exp>):
2475 SUBEXP
2476 |
2477 <reg_exp>
2478*/
2479
2480static bin_tree_t *
2481parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2482 reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2483{
005de2e8 2484 re_dfa_t *dfa = preg->buffer;
eb4a14ed
LC
2485 bin_tree_t *tree;
2486 size_t cur_nsub;
2487 cur_nsub = preg->re_nsub++;
2488
2489 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2490
2491 /* The subexpression may be a null string. */
2492 if (token->type == OP_CLOSE_SUBEXP)
2493 tree = NULL;
2494 else
2495 {
2496 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2497 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
005de2e8
LC
2498 {
2499 if (tree != NULL)
2500 postorder (tree, free_tree, NULL);
2501 *err = REG_EPAREN;
2502 }
eb4a14ed
LC
2503 if (BE (*err != REG_NOERROR, 0))
2504 return NULL;
2505 }
2506
2507 if (cur_nsub <= '9' - '1')
2508 dfa->completed_bkref_map |= 1 << cur_nsub;
2509
2510 tree = create_tree (dfa, tree, NULL, SUBEXP);
2511 if (BE (tree == NULL, 0))
2512 {
2513 *err = REG_ESPACE;
2514 return NULL;
2515 }
2516 tree->token.opr.idx = cur_nsub;
2517 return tree;
2518}
2519
2520/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2521
2522static bin_tree_t *
2523parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2524 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2525{
2526 bin_tree_t *tree = NULL, *old_tree = NULL;
2527 Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2528 re_token_t start_token = *token;
2529
2530 if (token->type == OP_OPEN_DUP_NUM)
2531 {
2532 end = 0;
2533 start = fetch_number (regexp, token, syntax);
2534 if (start == REG_MISSING)
2535 {
2536 if (token->type == CHARACTER && token->opr.c == ',')
2537 start = 0; /* We treat "{,m}" as "{0,m}". */
2538 else
2539 {
2540 *err = REG_BADBR; /* <re>{} is invalid. */
2541 return NULL;
2542 }
2543 }
2544 if (BE (start != REG_ERROR, 1))
2545 {
2546 /* We treat "{n}" as "{n,n}". */
2547 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2548 : ((token->type == CHARACTER && token->opr.c == ',')
2549 ? fetch_number (regexp, token, syntax) : REG_ERROR));
2550 }
2551 if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2552 {
2553 /* Invalid sequence. */
2554 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2555 {
2556 if (token->type == END_OF_RE)
2557 *err = REG_EBRACE;
2558 else
2559 *err = REG_BADBR;
2560
2561 return NULL;
2562 }
2563
2564 /* If the syntax bit is set, rollback. */
2565 re_string_set_index (regexp, start_idx);
2566 *token = start_token;
2567 token->type = CHARACTER;
2568 /* mb_partial and word_char bits should be already initialized by
2569 peek_token. */
2570 return elem;
2571 }
2572
2573 if (BE ((end != REG_MISSING && start > end)
2574 || token->type != OP_CLOSE_DUP_NUM, 0))
2575 {
2576 /* First number greater than second. */
2577 *err = REG_BADBR;
2578 return NULL;
2579 }
005de2e8
LC
2580
2581 if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
2582 {
2583 *err = REG_ESIZE;
2584 return NULL;
2585 }
eb4a14ed
LC
2586 }
2587 else
2588 {
2589 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2590 end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2591 }
2592
2593 fetch_token (token, regexp, syntax);
2594
2595 if (BE (elem == NULL, 0))
2596 return NULL;
2597 if (BE (start == 0 && end == 0, 0))
2598 {
2599 postorder (elem, free_tree, NULL);
2600 return NULL;
2601 }
2602
2603 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2604 if (BE (start > 0, 0))
2605 {
2606 tree = elem;
2607 for (i = 2; i <= start; ++i)
2608 {
2609 elem = duplicate_tree (elem, dfa);
2610 tree = create_tree (dfa, tree, elem, CONCAT);
2611 if (BE (elem == NULL || tree == NULL, 0))
2612 goto parse_dup_op_espace;
2613 }
2614
2615 if (start == end)
2616 return tree;
2617
2618 /* Duplicate ELEM before it is marked optional. */
2619 elem = duplicate_tree (elem, dfa);
2620 old_tree = tree;
2621 }
2622 else
2623 old_tree = NULL;
2624
2625 if (elem->token.type == SUBEXP)
005de2e8
LC
2626 {
2627 uintptr_t subidx = elem->token.opr.idx;
2628 postorder (elem, mark_opt_subexp, (void *) subidx);
2629 }
eb4a14ed
LC
2630
2631 tree = create_tree (dfa, elem, NULL,
2632 (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2633 if (BE (tree == NULL, 0))
2634 goto parse_dup_op_espace;
2635
2636/* From gnulib's "intprops.h":
2637 True if the arithmetic type T is signed. */
2638#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
2639
2640 /* This loop is actually executed only when end != REG_MISSING,
2641 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2642 already created the start+1-th copy. */
2643 if (TYPE_SIGNED (Idx) || end != REG_MISSING)
2644 for (i = start + 2; i <= end; ++i)
2645 {
2646 elem = duplicate_tree (elem, dfa);
2647 tree = create_tree (dfa, tree, elem, CONCAT);
2648 if (BE (elem == NULL || tree == NULL, 0))
2649 goto parse_dup_op_espace;
2650
2651 tree = create_tree (dfa, tree, NULL, OP_ALT);
2652 if (BE (tree == NULL, 0))
2653 goto parse_dup_op_espace;
2654 }
2655
2656 if (old_tree)
2657 tree = create_tree (dfa, old_tree, tree, CONCAT);
2658
2659 return tree;
2660
2661 parse_dup_op_espace:
2662 *err = REG_ESPACE;
2663 return NULL;
2664}
2665
2666/* Size of the names for collating symbol/equivalence_class/character_class.
2667 I'm not sure, but maybe enough. */
2668#define BRACKET_NAME_BUF_SIZE 32
2669
2670#ifndef _LIBC
2671 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2672 Build the range expression which starts from START_ELEM, and ends
2673 at END_ELEM. The result are written to MBCSET and SBCSET.
2674 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
005de2e8 2675 mbcset->range_ends, is a pointer argument since we may
eb4a14ed
LC
2676 update it. */
2677
2678static reg_errcode_t
2679internal_function
2680# ifdef RE_ENABLE_I18N
2681build_range_exp (const reg_syntax_t syntax,
2682 bitset_t sbcset,
2683 re_charset_t *mbcset,
2684 Idx *range_alloc,
2685 const bracket_elem_t *start_elem,
2686 const bracket_elem_t *end_elem)
2687# else /* not RE_ENABLE_I18N */
2688build_range_exp (const reg_syntax_t syntax,
2689 bitset_t sbcset,
2690 const bracket_elem_t *start_elem,
2691 const bracket_elem_t *end_elem)
2692# endif /* not RE_ENABLE_I18N */
2693{
2694 unsigned int start_ch, end_ch;
2695 /* Equivalence Classes and Character Classes can't be a range start/end. */
2696 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2697 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2698 0))
2699 return REG_ERANGE;
2700
2701 /* We can handle no multi character collating elements without libc
2702 support. */
2703 if (BE ((start_elem->type == COLL_SYM
2704 && strlen ((char *) start_elem->opr.name) > 1)
2705 || (end_elem->type == COLL_SYM
2706 && strlen ((char *) end_elem->opr.name) > 1), 0))
2707 return REG_ECOLLATE;
2708
2709# ifdef RE_ENABLE_I18N
2710 {
2711 wchar_t wc;
2712 wint_t start_wc;
2713 wint_t end_wc;
2714 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2715
2716 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2717 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2718 : 0));
2719 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2720 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2721 : 0));
2722 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2723 ? __btowc (start_ch) : start_elem->opr.wch);
2724 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2725 ? __btowc (end_ch) : end_elem->opr.wch);
2726 if (start_wc == WEOF || end_wc == WEOF)
2727 return REG_ECOLLATE;
2728 cmp_buf[0] = start_wc;
2729 cmp_buf[4] = end_wc;
2730
2731 if (BE ((syntax & RE_NO_EMPTY_RANGES)
2732 && wcscoll (cmp_buf, cmp_buf + 4) > 0, 0))
2733 return REG_ERANGE;
2734
2735 /* Got valid collation sequence values, add them as a new entry.
2736 However, for !_LIBC we have no collation elements: if the
2737 character set is single byte, the single byte character set
2738 that we build below suffices. parse_bracket_exp passes
2739 no MBCSET if dfa->mb_cur_max == 1. */
2740 if (mbcset)
2741 {
2742 /* Check the space of the arrays. */
2743 if (BE (*range_alloc == mbcset->nranges, 0))
2744 {
2745 /* There is not enough space, need realloc. */
2746 wchar_t *new_array_start, *new_array_end;
2747 Idx new_nranges;
2748
2749 /* +1 in case of mbcset->nranges is 0. */
2750 new_nranges = 2 * mbcset->nranges + 1;
2751 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2752 are NULL if *range_alloc == 0. */
2753 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2754 new_nranges);
2755 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2756 new_nranges);
2757
2758 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2759 return REG_ESPACE;
2760
2761 mbcset->range_starts = new_array_start;
2762 mbcset->range_ends = new_array_end;
2763 *range_alloc = new_nranges;
2764 }
2765
2766 mbcset->range_starts[mbcset->nranges] = start_wc;
2767 mbcset->range_ends[mbcset->nranges++] = end_wc;
2768 }
2769
2770 /* Build the table for single byte characters. */
2771 for (wc = 0; wc < SBC_MAX; ++wc)
2772 {
2773 cmp_buf[2] = wc;
2774 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2775 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2776 bitset_set (sbcset, wc);
2777 }
2778 }
2779# else /* not RE_ENABLE_I18N */
2780 {
2781 unsigned int ch;
2782 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2783 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2784 : 0));
2785 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2786 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2787 : 0));
2788 if (start_ch > end_ch)
2789 return REG_ERANGE;
2790 /* Build the table for single byte characters. */
2791 for (ch = 0; ch < SBC_MAX; ++ch)
2792 if (start_ch <= ch && ch <= end_ch)
2793 bitset_set (sbcset, ch);
2794 }
2795# endif /* not RE_ENABLE_I18N */
2796 return REG_NOERROR;
2797}
2798#endif /* not _LIBC */
2799
2800#ifndef _LIBC
2801/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2802 Build the collating element which is represented by NAME.
2803 The result are written to MBCSET and SBCSET.
2804 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2805 pointer argument since we may update it. */
2806
2807static reg_errcode_t
2808internal_function
eb4a14ed 2809# ifdef RE_ENABLE_I18N
005de2e8
LC
2810build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2811 Idx *coll_sym_alloc, const unsigned char *name)
2812# else /* not RE_ENABLE_I18N */
2813build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2814# endif /* not RE_ENABLE_I18N */
eb4a14ed
LC
2815{
2816 size_t name_len = strlen ((const char *) name);
2817 if (BE (name_len != 1, 0))
2818 return REG_ECOLLATE;
2819 else
2820 {
2821 bitset_set (sbcset, name[0]);
2822 return REG_NOERROR;
2823 }
2824}
2825#endif /* not _LIBC */
2826
2827/* This function parse bracket expression like "[abc]", "[a-c]",
2828 "[[.a-a.]]" etc. */
2829
2830static bin_tree_t *
2831parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2832 reg_syntax_t syntax, reg_errcode_t *err)
2833{
2834#ifdef _LIBC
2835 const unsigned char *collseqmb;
2836 const char *collseqwc;
2837 uint32_t nrules;
2838 int32_t table_size;
2839 const int32_t *symb_table;
2840 const unsigned char *extra;
2841
005de2e8
LC
2842 /* Local function for parse_bracket_exp used in _LIBC environment.
2843 Seek the collating symbol entry corresponding to NAME.
eb4a14ed
LC
2844 Return the index of the symbol in the SYMB_TABLE. */
2845
2846 auto inline int32_t
2847 __attribute ((always_inline))
2848 seek_collating_symbol_entry (name, name_len)
2849 const unsigned char *name;
2850 size_t name_len;
2851 {
2852 int32_t hash = elem_hash ((const char *) name, name_len);
2853 int32_t elem = hash % table_size;
2854 if (symb_table[2 * elem] != 0)
2855 {
2856 int32_t second = hash % (table_size - 2) + 1;
2857
2858 do
2859 {
2860 /* First compare the hashing value. */
2861 if (symb_table[2 * elem] == hash
2862 /* Compare the length of the name. */
2863 && name_len == extra[symb_table[2 * elem + 1]]
2864 /* Compare the name. */
2865 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2866 name_len) == 0)
2867 {
2868 /* Yep, this is the entry. */
2869 break;
2870 }
2871
2872 /* Next entry. */
2873 elem += second;
2874 }
2875 while (symb_table[2 * elem] != 0);
2876 }
2877 return elem;
2878 }
2879
2880 /* Local function for parse_bracket_exp used in _LIBC environment.
2881 Look up the collation sequence value of BR_ELEM.
2882 Return the value if succeeded, UINT_MAX otherwise. */
2883
2884 auto inline unsigned int
2885 __attribute ((always_inline))
2886 lookup_collation_sequence_value (br_elem)
2887 bracket_elem_t *br_elem;
2888 {
2889 if (br_elem->type == SB_CHAR)
2890 {
2891 /*
2892 if (MB_CUR_MAX == 1)
2893 */
2894 if (nrules == 0)
2895 return collseqmb[br_elem->opr.ch];
2896 else
2897 {
2898 wint_t wc = __btowc (br_elem->opr.ch);
2899 return __collseq_table_lookup (collseqwc, wc);
2900 }
2901 }
2902 else if (br_elem->type == MB_CHAR)
2903 {
2904 if (nrules != 0)
2905 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2906 }
2907 else if (br_elem->type == COLL_SYM)
2908 {
2909 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2910 if (nrules != 0)
2911 {
2912 int32_t elem, idx;
2913 elem = seek_collating_symbol_entry (br_elem->opr.name,
2914 sym_name_len);
2915 if (symb_table[2 * elem] != 0)
2916 {
2917 /* We found the entry. */
2918 idx = symb_table[2 * elem + 1];
2919 /* Skip the name of collating element name. */
2920 idx += 1 + extra[idx];
2921 /* Skip the byte sequence of the collating element. */
2922 idx += 1 + extra[idx];
2923 /* Adjust for the alignment. */
2924 idx = (idx + 3) & ~3;
2925 /* Skip the multibyte collation sequence value. */
2926 idx += sizeof (unsigned int);
2927 /* Skip the wide char sequence of the collating element. */
2928 idx += sizeof (unsigned int) *
2929 (1 + *(unsigned int *) (extra + idx));
2930 /* Return the collation sequence value. */
2931 return *(unsigned int *) (extra + idx);
2932 }
2933 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2934 {
2935 /* No valid character. Match it as a single byte
2936 character. */
2937 return collseqmb[br_elem->opr.name[0]];
2938 }
2939 }
2940 else if (sym_name_len == 1)
2941 return collseqmb[br_elem->opr.name[0]];
2942 }
2943 return UINT_MAX;
2944 }
2945
005de2e8 2946 /* Local function for parse_bracket_exp used in _LIBC environment.
eb4a14ed
LC
2947 Build the range expression which starts from START_ELEM, and ends
2948 at END_ELEM. The result are written to MBCSET and SBCSET.
2949 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
005de2e8 2950 mbcset->range_ends, is a pointer argument since we may
eb4a14ed
LC
2951 update it. */
2952
2953 auto inline reg_errcode_t
2954 __attribute ((always_inline))
2955 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2956 re_charset_t *mbcset;
2957 Idx *range_alloc;
2958 bitset_t sbcset;
2959 bracket_elem_t *start_elem, *end_elem;
2960 {
2961 unsigned int ch;
2962 uint32_t start_collseq;
2963 uint32_t end_collseq;
2964
2965 /* Equivalence Classes and Character Classes can't be a range
2966 start/end. */
2967 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2968 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2969 0))
2970 return REG_ERANGE;
2971
2972 start_collseq = lookup_collation_sequence_value (start_elem);
2973 end_collseq = lookup_collation_sequence_value (end_elem);
2974 /* Check start/end collation sequence values. */
2975 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2976 return REG_ECOLLATE;
2977 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2978 return REG_ERANGE;
2979
2980 /* Got valid collation sequence values, add them as a new entry.
2981 However, if we have no collation elements, and the character set
2982 is single byte, the single byte character set that we
2983 build below suffices. */
2984 if (nrules > 0 || dfa->mb_cur_max > 1)
2985 {
2986 /* Check the space of the arrays. */
2987 if (BE (*range_alloc == mbcset->nranges, 0))
2988 {
2989 /* There is not enough space, need realloc. */
2990 uint32_t *new_array_start;
2991 uint32_t *new_array_end;
2992 Idx new_nranges;
2993
2994 /* +1 in case of mbcset->nranges is 0. */
2995 new_nranges = 2 * mbcset->nranges + 1;
2996 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2997 new_nranges);
2998 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2999 new_nranges);
3000
3001 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
3002 return REG_ESPACE;
3003
3004 mbcset->range_starts = new_array_start;
3005 mbcset->range_ends = new_array_end;
3006 *range_alloc = new_nranges;
3007 }
3008
3009 mbcset->range_starts[mbcset->nranges] = start_collseq;
3010 mbcset->range_ends[mbcset->nranges++] = end_collseq;
3011 }
3012
3013 /* Build the table for single byte characters. */
3014 for (ch = 0; ch < SBC_MAX; ch++)
3015 {
3016 uint32_t ch_collseq;
3017 /*
3018 if (MB_CUR_MAX == 1)
3019 */
3020 if (nrules == 0)
3021 ch_collseq = collseqmb[ch];
3022 else
3023 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
3024 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3025 bitset_set (sbcset, ch);
3026 }
3027 return REG_NOERROR;
3028 }
3029
005de2e8 3030 /* Local function for parse_bracket_exp used in _LIBC environment.
eb4a14ed
LC
3031 Build the collating element which is represented by NAME.
3032 The result are written to MBCSET and SBCSET.
3033 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
005de2e8 3034 pointer argument since we may update it. */
eb4a14ed
LC
3035
3036 auto inline reg_errcode_t
3037 __attribute ((always_inline))
3038 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3039 re_charset_t *mbcset;
3040 Idx *coll_sym_alloc;
3041 bitset_t sbcset;
3042 const unsigned char *name;
3043 {
3044 int32_t elem, idx;
3045 size_t name_len = strlen ((const char *) name);
3046 if (nrules != 0)
3047 {
3048 elem = seek_collating_symbol_entry (name, name_len);
3049 if (symb_table[2 * elem] != 0)
3050 {
3051 /* We found the entry. */
3052 idx = symb_table[2 * elem + 1];
3053 /* Skip the name of collating element name. */
3054 idx += 1 + extra[idx];
3055 }
3056 else if (symb_table[2 * elem] == 0 && name_len == 1)
3057 {
3058 /* No valid character, treat it as a normal
3059 character. */
3060 bitset_set (sbcset, name[0]);
3061 return REG_NOERROR;
3062 }
3063 else
3064 return REG_ECOLLATE;
3065
3066 /* Got valid collation sequence, add it as a new entry. */
3067 /* Check the space of the arrays. */
3068 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3069 {
3070 /* Not enough, realloc it. */
3071 /* +1 in case of mbcset->ncoll_syms is 0. */
3072 Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3073 /* Use realloc since mbcset->coll_syms is NULL
3074 if *alloc == 0. */
3075 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3076 new_coll_sym_alloc);
3077 if (BE (new_coll_syms == NULL, 0))
3078 return REG_ESPACE;
3079 mbcset->coll_syms = new_coll_syms;
3080 *coll_sym_alloc = new_coll_sym_alloc;
3081 }
3082 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3083 return REG_NOERROR;
3084 }
3085 else
3086 {
3087 if (BE (name_len != 1, 0))
3088 return REG_ECOLLATE;
3089 else
3090 {
3091 bitset_set (sbcset, name[0]);
3092 return REG_NOERROR;
3093 }
3094 }
3095 }
3096#endif
3097
3098 re_token_t br_token;
3099 re_bitset_ptr_t sbcset;
3100#ifdef RE_ENABLE_I18N
3101 re_charset_t *mbcset;
3102 Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3103 Idx equiv_class_alloc = 0, char_class_alloc = 0;
3104#endif /* not RE_ENABLE_I18N */
3105 bool non_match = false;
3106 bin_tree_t *work_tree;
3107 int token_len;
3108 bool first_round = true;
3109#ifdef _LIBC
3110 collseqmb = (const unsigned char *)
3111 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3112 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3113 if (nrules)
3114 {
3115 /*
3116 if (MB_CUR_MAX > 1)
3117 */
3118 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3119 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3120 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3121 _NL_COLLATE_SYMB_TABLEMB);
3122 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3123 _NL_COLLATE_SYMB_EXTRAMB);
3124 }
3125#endif
3126 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3127#ifdef RE_ENABLE_I18N
3128 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3129#endif /* RE_ENABLE_I18N */
3130#ifdef RE_ENABLE_I18N
3131 if (BE (sbcset == NULL || mbcset == NULL, 0))
3132#else
3133 if (BE (sbcset == NULL, 0))
3134#endif /* RE_ENABLE_I18N */
3135 {
005de2e8
LC
3136 re_free (sbcset);
3137#ifdef RE_ENABLE_I18N
3138 re_free (mbcset);
3139#endif
eb4a14ed
LC
3140 *err = REG_ESPACE;
3141 return NULL;
3142 }
3143
3144 token_len = peek_token_bracket (token, regexp, syntax);
3145 if (BE (token->type == END_OF_RE, 0))
3146 {
3147 *err = REG_BADPAT;
3148 goto parse_bracket_exp_free_return;
3149 }
3150 if (token->type == OP_NON_MATCH_LIST)
3151 {
3152#ifdef RE_ENABLE_I18N
3153 mbcset->non_match = 1;
3154#endif /* not RE_ENABLE_I18N */
3155 non_match = true;
3156 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3157 bitset_set (sbcset, '\n');
3158 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3159 token_len = peek_token_bracket (token, regexp, syntax);
3160 if (BE (token->type == END_OF_RE, 0))
3161 {
3162 *err = REG_BADPAT;
3163 goto parse_bracket_exp_free_return;
3164 }
3165 }
3166
3167 /* We treat the first ']' as a normal character. */
3168 if (token->type == OP_CLOSE_BRACKET)
3169 token->type = CHARACTER;
3170
3171 while (1)
3172 {
3173 bracket_elem_t start_elem, end_elem;
3174 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3175 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3176 reg_errcode_t ret;
3177 int token_len2 = 0;
3178 bool is_range_exp = false;
3179 re_token_t token2;
3180
3181 start_elem.opr.name = start_name_buf;
3182 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3183 syntax, first_round);
3184 if (BE (ret != REG_NOERROR, 0))
3185 {
3186 *err = ret;
3187 goto parse_bracket_exp_free_return;
3188 }
3189 first_round = false;
3190
3191 /* Get information about the next token. We need it in any case. */
3192 token_len = peek_token_bracket (token, regexp, syntax);
3193
3194 /* Do not check for ranges if we know they are not allowed. */
3195 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3196 {
3197 if (BE (token->type == END_OF_RE, 0))
3198 {
3199 *err = REG_EBRACK;
3200 goto parse_bracket_exp_free_return;
3201 }
3202 if (token->type == OP_CHARSET_RANGE)
3203 {
3204 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3205 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3206 if (BE (token2.type == END_OF_RE, 0))
3207 {
3208 *err = REG_EBRACK;
3209 goto parse_bracket_exp_free_return;
3210 }
3211 if (token2.type == OP_CLOSE_BRACKET)
3212 {
3213 /* We treat the last '-' as a normal character. */
3214 re_string_skip_bytes (regexp, -token_len);
3215 token->type = CHARACTER;
3216 }
3217 else
3218 is_range_exp = true;
3219 }
3220 }
3221
3222 if (is_range_exp == true)
3223 {
3224 end_elem.opr.name = end_name_buf;
3225 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3226 dfa, syntax, true);
3227 if (BE (ret != REG_NOERROR, 0))
3228 {
3229 *err = ret;
3230 goto parse_bracket_exp_free_return;
3231 }
3232
3233 token_len = peek_token_bracket (token, regexp, syntax);
3234
3235#ifdef _LIBC
3236 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3237 &start_elem, &end_elem);
3238#else
3239# ifdef RE_ENABLE_I18N
3240 *err = build_range_exp (syntax, sbcset,
3241 dfa->mb_cur_max > 1 ? mbcset : NULL,
3242 &range_alloc, &start_elem, &end_elem);
3243# else
3244 *err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
3245# endif
3246#endif /* RE_ENABLE_I18N */
3247 if (BE (*err != REG_NOERROR, 0))
3248 goto parse_bracket_exp_free_return;
3249 }
3250 else
3251 {
3252 switch (start_elem.type)
3253 {
3254 case SB_CHAR:
3255 bitset_set (sbcset, start_elem.opr.ch);
3256 break;
3257#ifdef RE_ENABLE_I18N
3258 case MB_CHAR:
3259 /* Check whether the array has enough space. */
3260 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3261 {
3262 wchar_t *new_mbchars;
3263 /* Not enough, realloc it. */
3264 /* +1 in case of mbcset->nmbchars is 0. */
3265 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3266 /* Use realloc since array is NULL if *alloc == 0. */
3267 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3268 mbchar_alloc);
3269 if (BE (new_mbchars == NULL, 0))
3270 goto parse_bracket_exp_espace;
3271 mbcset->mbchars = new_mbchars;
3272 }
3273 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3274 break;
3275#endif /* RE_ENABLE_I18N */
3276 case EQUIV_CLASS:
3277 *err = build_equiv_class (sbcset,
3278#ifdef RE_ENABLE_I18N
3279 mbcset, &equiv_class_alloc,
3280#endif /* RE_ENABLE_I18N */
3281 start_elem.opr.name);
3282 if (BE (*err != REG_NOERROR, 0))
3283 goto parse_bracket_exp_free_return;
3284 break;
3285 case COLL_SYM:
3286 *err = build_collating_symbol (sbcset,
3287#ifdef RE_ENABLE_I18N
3288 mbcset, &coll_sym_alloc,
3289#endif /* RE_ENABLE_I18N */
3290 start_elem.opr.name);
3291 if (BE (*err != REG_NOERROR, 0))
3292 goto parse_bracket_exp_free_return;
3293 break;
3294 case CHAR_CLASS:
3295 *err = build_charclass (regexp->trans, sbcset,
3296#ifdef RE_ENABLE_I18N
3297 mbcset, &char_class_alloc,
3298#endif /* RE_ENABLE_I18N */
3299 start_elem.opr.name, syntax);
3300 if (BE (*err != REG_NOERROR, 0))
3301 goto parse_bracket_exp_free_return;
3302 break;
3303 default:
3304 assert (0);
3305 break;
3306 }
3307 }
3308 if (BE (token->type == END_OF_RE, 0))
3309 {
3310 *err = REG_EBRACK;
3311 goto parse_bracket_exp_free_return;
3312 }
3313 if (token->type == OP_CLOSE_BRACKET)
3314 break;
3315 }
3316
3317 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3318
3319 /* If it is non-matching list. */
3320 if (non_match)
3321 bitset_not (sbcset);
3322
3323#ifdef RE_ENABLE_I18N
3324 /* Ensure only single byte characters are set. */
3325 if (dfa->mb_cur_max > 1)
3326 bitset_mask (sbcset, dfa->sb_char);
3327
3328 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3329 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3330 || mbcset->non_match)))
3331 {
3332 bin_tree_t *mbc_tree;
3333 int sbc_idx;
3334 /* Build a tree for complex bracket. */
3335 dfa->has_mb_node = 1;
3336 br_token.type = COMPLEX_BRACKET;
3337 br_token.opr.mbcset = mbcset;
3338 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3339 if (BE (mbc_tree == NULL, 0))
3340 goto parse_bracket_exp_espace;
3341 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3342 if (sbcset[sbc_idx])
3343 break;
3344 /* If there are no bits set in sbcset, there is no point
3345 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3346 if (sbc_idx < BITSET_WORDS)
3347 {
3348 /* Build a tree for simple bracket. */
3349 br_token.type = SIMPLE_BRACKET;
3350 br_token.opr.sbcset = sbcset;
3351 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3352 if (BE (work_tree == NULL, 0))
3353 goto parse_bracket_exp_espace;
3354
3355 /* Then join them by ALT node. */
3356 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3357 if (BE (work_tree == NULL, 0))
3358 goto parse_bracket_exp_espace;
3359 }
3360 else
3361 {
3362 re_free (sbcset);
3363 work_tree = mbc_tree;
3364 }
3365 }
3366 else
3367#endif /* not RE_ENABLE_I18N */
3368 {
3369#ifdef RE_ENABLE_I18N
3370 free_charset (mbcset);
3371#endif
3372 /* Build a tree for simple bracket. */
3373 br_token.type = SIMPLE_BRACKET;
3374 br_token.opr.sbcset = sbcset;
3375 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3376 if (BE (work_tree == NULL, 0))
3377 goto parse_bracket_exp_espace;
3378 }
3379 return work_tree;
3380
3381 parse_bracket_exp_espace:
3382 *err = REG_ESPACE;
3383 parse_bracket_exp_free_return:
3384 re_free (sbcset);
3385#ifdef RE_ENABLE_I18N
3386 free_charset (mbcset);
3387#endif /* RE_ENABLE_I18N */
3388 return NULL;
3389}
3390
3391/* Parse an element in the bracket expression. */
3392
3393static reg_errcode_t
3394parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3395 re_token_t *token, int token_len, re_dfa_t *dfa,
3396 reg_syntax_t syntax, bool accept_hyphen)
3397{
3398#ifdef RE_ENABLE_I18N
3399 int cur_char_size;
3400 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3401 if (cur_char_size > 1)
3402 {
3403 elem->type = MB_CHAR;
3404 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3405 re_string_skip_bytes (regexp, cur_char_size);
3406 return REG_NOERROR;
3407 }
3408#endif /* RE_ENABLE_I18N */
3409 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3410 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3411 || token->type == OP_OPEN_EQUIV_CLASS)
3412 return parse_bracket_symbol (elem, regexp, token);
3413 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3414 {
3415 /* A '-' must only appear as anything but a range indicator before
3416 the closing bracket. Everything else is an error. */
3417 re_token_t token2;
3418 (void) peek_token_bracket (&token2, regexp, syntax);
3419 if (token2.type != OP_CLOSE_BRACKET)
3420 /* The actual error value is not standardized since this whole
3421 case is undefined. But ERANGE makes good sense. */
3422 return REG_ERANGE;
3423 }
3424 elem->type = SB_CHAR;
3425 elem->opr.ch = token->opr.c;
3426 return REG_NOERROR;
3427}
3428
3429/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3430 such as [:<character_class>:], [.<collating_element>.], and
3431 [=<equivalent_class>=]. */
3432
3433static reg_errcode_t
3434parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3435 re_token_t *token)
3436{
3437 unsigned char ch, delim = token->opr.c;
3438 int i = 0;
3439 if (re_string_eoi(regexp))
3440 return REG_EBRACK;
3441 for (;; ++i)
3442 {
3443 if (i >= BRACKET_NAME_BUF_SIZE)
3444 return REG_EBRACK;
3445 if (token->type == OP_OPEN_CHAR_CLASS)
3446 ch = re_string_fetch_byte_case (regexp);
3447 else
3448 ch = re_string_fetch_byte (regexp);
3449 if (re_string_eoi(regexp))
3450 return REG_EBRACK;
3451 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3452 break;
3453 elem->opr.name[i] = ch;
3454 }
3455 re_string_skip_bytes (regexp, 1);
3456 elem->opr.name[i] = '\0';
3457 switch (token->type)
3458 {
3459 case OP_OPEN_COLL_ELEM:
3460 elem->type = COLL_SYM;
3461 break;
3462 case OP_OPEN_EQUIV_CLASS:
3463 elem->type = EQUIV_CLASS;
3464 break;
3465 case OP_OPEN_CHAR_CLASS:
3466 elem->type = CHAR_CLASS;
3467 break;
3468 default:
3469 break;
3470 }
3471 return REG_NOERROR;
3472}
3473
3474 /* Helper function for parse_bracket_exp.
3475 Build the equivalence class which is represented by NAME.
3476 The result are written to MBCSET and SBCSET.
3477 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
005de2e8 3478 is a pointer argument since we may update it. */
eb4a14ed
LC
3479
3480static reg_errcode_t
3481#ifdef RE_ENABLE_I18N
3482build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3483 Idx *equiv_class_alloc, const unsigned char *name)
3484#else /* not RE_ENABLE_I18N */
3485build_equiv_class (bitset_t sbcset, const unsigned char *name)
3486#endif /* not RE_ENABLE_I18N */
3487{
3488#ifdef _LIBC
3489 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3490 if (nrules != 0)
3491 {
3492 const int32_t *table, *indirect;
3493 const unsigned char *weights, *extra, *cp;
3494 unsigned char char_buf[2];
3495 int32_t idx1, idx2;
3496 unsigned int ch;
3497 size_t len;
3498 /* This #include defines a local function! */
3499# include <locale/weight.h>
3500 /* Calculate the index for equivalence class. */
3501 cp = name;
3502 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3503 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3504 _NL_COLLATE_WEIGHTMB);
3505 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3506 _NL_COLLATE_EXTRAMB);
3507 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3508 _NL_COLLATE_INDIRECTMB);
005de2e8
LC
3509 idx1 = findidx (&cp, -1);
3510 if (BE (idx1 == 0 || *cp != '\0', 0))
eb4a14ed
LC
3511 /* This isn't a valid character. */
3512 return REG_ECOLLATE;
3513
005de2e8 3514 /* Build single byte matching table for this equivalence class. */
eb4a14ed
LC
3515 len = weights[idx1 & 0xffffff];
3516 for (ch = 0; ch < SBC_MAX; ++ch)
3517 {
3518 char_buf[0] = ch;
3519 cp = char_buf;
005de2e8 3520 idx2 = findidx (&cp, 1);
eb4a14ed
LC
3521/*
3522 idx2 = table[ch];
3523*/
3524 if (idx2 == 0)
3525 /* This isn't a valid character. */
3526 continue;
3527 /* Compare only if the length matches and the collation rule
3528 index is the same. */
3529 if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3530 {
3531 int cnt = 0;
3532
3533 while (cnt <= len &&
3534 weights[(idx1 & 0xffffff) + 1 + cnt]
3535 == weights[(idx2 & 0xffffff) + 1 + cnt])
3536 ++cnt;
3537
3538 if (cnt > len)
3539 bitset_set (sbcset, ch);
3540 }
3541 }
3542 /* Check whether the array has enough space. */
3543 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3544 {
3545 /* Not enough, realloc it. */
3546 /* +1 in case of mbcset->nequiv_classes is 0. */
3547 Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3548 /* Use realloc since the array is NULL if *alloc == 0. */
3549 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3550 int32_t,
3551 new_equiv_class_alloc);
3552 if (BE (new_equiv_classes == NULL, 0))
3553 return REG_ESPACE;
3554 mbcset->equiv_classes = new_equiv_classes;
3555 *equiv_class_alloc = new_equiv_class_alloc;
3556 }
3557 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3558 }
3559 else
3560#endif /* _LIBC */
3561 {
3562 if (BE (strlen ((const char *) name) != 1, 0))
3563 return REG_ECOLLATE;
3564 bitset_set (sbcset, *name);
3565 }
3566 return REG_NOERROR;
3567}
3568
3569 /* Helper function for parse_bracket_exp.
3570 Build the character class which is represented by NAME.
3571 The result are written to MBCSET and SBCSET.
3572 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
005de2e8 3573 is a pointer argument since we may update it. */
eb4a14ed
LC
3574
3575static reg_errcode_t
3576#ifdef RE_ENABLE_I18N
3577build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3578 re_charset_t *mbcset, Idx *char_class_alloc,
3579 const unsigned char *class_name, reg_syntax_t syntax)
3580#else /* not RE_ENABLE_I18N */
3581build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3582 const unsigned char *class_name, reg_syntax_t syntax)
3583#endif /* not RE_ENABLE_I18N */
3584{
3585 int i;
3586 const char *name = (const char *) class_name;
3587
3588 /* In case of REG_ICASE "upper" and "lower" match the both of
3589 upper and lower cases. */
3590 if ((syntax & RE_ICASE)
3591 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3592 name = "alpha";
3593
3594#ifdef RE_ENABLE_I18N
3595 /* Check the space of the arrays. */
3596 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3597 {
3598 /* Not enough, realloc it. */
3599 /* +1 in case of mbcset->nchar_classes is 0. */
3600 Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3601 /* Use realloc since array is NULL if *alloc == 0. */
3602 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3603 new_char_class_alloc);
3604 if (BE (new_char_classes == NULL, 0))
3605 return REG_ESPACE;
3606 mbcset->char_classes = new_char_classes;
3607 *char_class_alloc = new_char_class_alloc;
3608 }
3609 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3610#endif /* RE_ENABLE_I18N */
3611
3612#define BUILD_CHARCLASS_LOOP(ctype_func) \
3613 do { \
3614 if (BE (trans != NULL, 0)) \
3615 { \
3616 for (i = 0; i < SBC_MAX; ++i) \
3617 if (ctype_func (i)) \
3618 bitset_set (sbcset, trans[i]); \
3619 } \
3620 else \
3621 { \
3622 for (i = 0; i < SBC_MAX; ++i) \
3623 if (ctype_func (i)) \
3624 bitset_set (sbcset, i); \
3625 } \
3626 } while (0)
3627
3628 if (strcmp (name, "alnum") == 0)
3629 BUILD_CHARCLASS_LOOP (isalnum);
3630 else if (strcmp (name, "cntrl") == 0)
3631 BUILD_CHARCLASS_LOOP (iscntrl);
3632 else if (strcmp (name, "lower") == 0)
3633 BUILD_CHARCLASS_LOOP (islower);
3634 else if (strcmp (name, "space") == 0)
3635 BUILD_CHARCLASS_LOOP (isspace);
3636 else if (strcmp (name, "alpha") == 0)
3637 BUILD_CHARCLASS_LOOP (isalpha);
3638 else if (strcmp (name, "digit") == 0)
3639 BUILD_CHARCLASS_LOOP (isdigit);
3640 else if (strcmp (name, "print") == 0)
3641 BUILD_CHARCLASS_LOOP (isprint);
3642 else if (strcmp (name, "upper") == 0)
3643 BUILD_CHARCLASS_LOOP (isupper);
3644 else if (strcmp (name, "blank") == 0)
3645 BUILD_CHARCLASS_LOOP (isblank);
3646 else if (strcmp (name, "graph") == 0)
3647 BUILD_CHARCLASS_LOOP (isgraph);
3648 else if (strcmp (name, "punct") == 0)
3649 BUILD_CHARCLASS_LOOP (ispunct);
3650 else if (strcmp (name, "xdigit") == 0)
3651 BUILD_CHARCLASS_LOOP (isxdigit);
3652 else
3653 return REG_ECTYPE;
3654
3655 return REG_NOERROR;
3656}
3657
3658static bin_tree_t *
3659build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3660 const unsigned char *class_name,
3661 const unsigned char *extra, bool non_match,
3662 reg_errcode_t *err)
3663{
3664 re_bitset_ptr_t sbcset;
3665#ifdef RE_ENABLE_I18N
3666 re_charset_t *mbcset;
3667 Idx alloc = 0;
3668#endif /* not RE_ENABLE_I18N */
3669 reg_errcode_t ret;
3670 re_token_t br_token;
3671 bin_tree_t *tree;
3672
3673 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3674#ifdef RE_ENABLE_I18N
3675 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3676#endif /* RE_ENABLE_I18N */
3677
3678#ifdef RE_ENABLE_I18N
3679 if (BE (sbcset == NULL || mbcset == NULL, 0))
3680#else /* not RE_ENABLE_I18N */
3681 if (BE (sbcset == NULL, 0))
3682#endif /* not RE_ENABLE_I18N */
3683 {
3684 *err = REG_ESPACE;
3685 return NULL;
3686 }
3687
3688 if (non_match)
3689 {
3690#ifdef RE_ENABLE_I18N
3691 mbcset->non_match = 1;
3692#endif /* not RE_ENABLE_I18N */
3693 }
3694
3695 /* We don't care the syntax in this case. */
3696 ret = build_charclass (trans, sbcset,
3697#ifdef RE_ENABLE_I18N
3698 mbcset, &alloc,
3699#endif /* RE_ENABLE_I18N */
3700 class_name, 0);
3701
3702 if (BE (ret != REG_NOERROR, 0))
3703 {
3704 re_free (sbcset);
3705#ifdef RE_ENABLE_I18N
3706 free_charset (mbcset);
3707#endif /* RE_ENABLE_I18N */
3708 *err = ret;
3709 return NULL;
3710 }
3711 /* \w match '_' also. */
3712 for (; *extra; extra++)
3713 bitset_set (sbcset, *extra);
3714
3715 /* If it is non-matching list. */
3716 if (non_match)
3717 bitset_not (sbcset);
3718
3719#ifdef RE_ENABLE_I18N
3720 /* Ensure only single byte characters are set. */
3721 if (dfa->mb_cur_max > 1)
3722 bitset_mask (sbcset, dfa->sb_char);
3723#endif
3724
3725 /* Build a tree for simple bracket. */
3726 br_token.type = SIMPLE_BRACKET;
3727 br_token.opr.sbcset = sbcset;
3728 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3729 if (BE (tree == NULL, 0))
3730 goto build_word_op_espace;
3731
3732#ifdef RE_ENABLE_I18N
3733 if (dfa->mb_cur_max > 1)
3734 {
3735 bin_tree_t *mbc_tree;
3736 /* Build a tree for complex bracket. */
3737 br_token.type = COMPLEX_BRACKET;
3738 br_token.opr.mbcset = mbcset;
3739 dfa->has_mb_node = 1;
3740 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3741 if (BE (mbc_tree == NULL, 0))
3742 goto build_word_op_espace;
3743 /* Then join them by ALT node. */
3744 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3745 if (BE (mbc_tree != NULL, 1))
3746 return tree;
3747 }
3748 else
3749 {
3750 free_charset (mbcset);
3751 return tree;
3752 }
3753#else /* not RE_ENABLE_I18N */
3754 return tree;
3755#endif /* not RE_ENABLE_I18N */
3756
3757 build_word_op_espace:
3758 re_free (sbcset);
3759#ifdef RE_ENABLE_I18N
3760 free_charset (mbcset);
3761#endif /* RE_ENABLE_I18N */
3762 *err = REG_ESPACE;
3763 return NULL;
3764}
3765
3766/* This is intended for the expressions like "a{1,3}".
3767 Fetch a number from 'input', and return the number.
3768 Return REG_MISSING if the number field is empty like "{,1}".
005de2e8 3769 Return RE_DUP_MAX + 1 if the number field is too large.
eb4a14ed
LC
3770 Return REG_ERROR if an error occurred. */
3771
3772static Idx
3773fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3774{
3775 Idx num = REG_MISSING;
3776 unsigned char c;
3777 while (1)
3778 {
3779 fetch_token (token, input, syntax);
3780 c = token->opr.c;
3781 if (BE (token->type == END_OF_RE, 0))
3782 return REG_ERROR;
3783 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3784 break;
3785 num = ((token->type != CHARACTER || c < '0' || '9' < c
3786 || num == REG_ERROR)
3787 ? REG_ERROR
005de2e8
LC
3788 : num == REG_MISSING
3789 ? c - '0'
3790 : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
eb4a14ed
LC
3791 }
3792 return num;
3793}
3794\f
3795#ifdef RE_ENABLE_I18N
3796static void
3797free_charset (re_charset_t *cset)
3798{
3799 re_free (cset->mbchars);
3800# ifdef _LIBC
3801 re_free (cset->coll_syms);
3802 re_free (cset->equiv_classes);
3803 re_free (cset->range_starts);
3804 re_free (cset->range_ends);
3805# endif
3806 re_free (cset->char_classes);
3807 re_free (cset);
3808}
3809#endif /* RE_ENABLE_I18N */
3810\f
3811/* Functions for binary tree operation. */
3812
3813/* Create a tree node. */
3814
3815static bin_tree_t *
3816create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3817 re_token_type_t type)
3818{
3819 re_token_t t;
3820 t.type = type;
3821 return create_token_tree (dfa, left, right, &t);
3822}
3823
3824static bin_tree_t *
3825create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3826 const re_token_t *token)
3827{
3828 bin_tree_t *tree;
3829 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3830 {
3831 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3832
3833 if (storage == NULL)
3834 return NULL;
3835 storage->next = dfa->str_tree_storage;
3836 dfa->str_tree_storage = storage;
3837 dfa->str_tree_storage_idx = 0;
3838 }
3839 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3840
3841 tree->parent = NULL;
3842 tree->left = left;
3843 tree->right = right;
3844 tree->token = *token;
3845 tree->token.duplicated = 0;
3846 tree->token.opt_subexp = 0;
3847 tree->first = NULL;
3848 tree->next = NULL;
3849 tree->node_idx = REG_MISSING;
3850
3851 if (left != NULL)
3852 left->parent = tree;
3853 if (right != NULL)
3854 right->parent = tree;
3855 return tree;
3856}
3857
3858/* Mark the tree SRC as an optional subexpression.
3859 To be called from preorder or postorder. */
3860
3861static reg_errcode_t
3862mark_opt_subexp (void *extra, bin_tree_t *node)
3863{
005de2e8 3864 Idx idx = (uintptr_t) extra;
eb4a14ed
LC
3865 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3866 node->token.opt_subexp = 1;
3867
3868 return REG_NOERROR;
3869}
3870
3871/* Free the allocated memory inside NODE. */
3872
3873static void
3874free_token (re_token_t *node)
3875{
3876#ifdef RE_ENABLE_I18N
3877 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3878 free_charset (node->opr.mbcset);
3879 else
3880#endif /* RE_ENABLE_I18N */
3881 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3882 re_free (node->opr.sbcset);
3883}
3884
3885/* Worker function for tree walking. Free the allocated memory inside NODE
3886 and its children. */
3887
3888static reg_errcode_t
3889free_tree (void *extra, bin_tree_t *node)
3890{
3891 free_token (&node->token);
3892 return REG_NOERROR;
3893}
3894
3895
3896/* Duplicate the node SRC, and return new node. This is a preorder
3897 visit similar to the one implemented by the generic visitor, but
3898 we need more infrastructure to maintain two parallel trees --- so,
3899 it's easier to duplicate. */
3900
3901static bin_tree_t *
3902duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3903{
3904 const bin_tree_t *node;
3905 bin_tree_t *dup_root;
3906 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3907
3908 for (node = root; ; )
3909 {
3910 /* Create a new tree and link it back to the current parent. */
3911 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3912 if (*p_new == NULL)
3913 return NULL;
3914 (*p_new)->parent = dup_node;
3915 (*p_new)->token.duplicated = 1;
3916 dup_node = *p_new;
3917
3918 /* Go to the left node, or up and to the right. */
3919 if (node->left)
3920 {
3921 node = node->left;
3922 p_new = &dup_node->left;
3923 }
3924 else
3925 {
3926 const bin_tree_t *prev = NULL;
3927 while (node->right == prev || node->right == NULL)
3928 {
3929 prev = node;
3930 node = node->parent;
3931 dup_node = dup_node->parent;
3932 if (!node)
3933 return dup_root;
3934 }
3935 node = node->right;
3936 p_new = &dup_node->right;
3937 }
3938 }
3939}