HCoop Git - bpt/guile.git/blame_incremental

... / ...

Commit	Line	Data
	1	/* Extended regular expression matching and search library.
	2	Copyright (C) 2002-2013 Free Software Foundation, Inc.
	3	This file is part of the GNU C Library.
	4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
	5
	6	The GNU C Library is free software; you can redistribute it and/or
	7	modify it under the terms of the GNU Lesser General Public
	8	License as published by the Free Software Foundation; either
	9	version 2.1 of the License, or (at your option) any later version.
	10
	11	The GNU C Library is distributed in the hope that it will be useful,
	12	but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	Lesser General Public License for more details.
	15
	16	You should have received a copy of the GNU Lesser General Public
	17	License along with the GNU C Library; if not, see
	18	<http://www.gnu.org/licenses/>. */
	19
	20	static reg_errcode_t re_compile_internal (regex_t preg, const char pattern,
	21	size_t length, reg_syntax_t syntax);
	22	static void re_compile_fastmap_iter (regex_t *bufp,
	23	const re_dfastate_t *init_state,
	24	char *fastmap);
	25	static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
	26	#ifdef RE_ENABLE_I18N
	27	static void free_charset (re_charset_t *cset);
	28	#endif /* RE_ENABLE_I18N */
	29	static void free_workarea_compile (regex_t *preg);
	30	static reg_errcode_t create_initial_state (re_dfa_t *dfa);
	31	#ifdef RE_ENABLE_I18N
	32	static void optimize_utf8 (re_dfa_t *dfa);
	33	#endif
	34	static reg_errcode_t analyze (regex_t *preg);
	35	static reg_errcode_t preorder (bin_tree_t *root,
	36	reg_errcode_t (fn (void , bin_tree_t )),
	37	void *extra);
	38	static reg_errcode_t postorder (bin_tree_t *root,
	39	reg_errcode_t (fn (void , bin_tree_t )),
	40	void *extra);
	41	static reg_errcode_t optimize_subexps (void extra, bin_tree_t node);
	42	static reg_errcode_t lower_subexps (void extra, bin_tree_t node);
	43	static bin_tree_t lower_subexp (reg_errcode_t err, regex_t *preg,
	44	bin_tree_t *node);
	45	static reg_errcode_t calc_first (void extra, bin_tree_t node);
	46	static reg_errcode_t calc_next (void extra, bin_tree_t node);
	47	static reg_errcode_t link_nfa_nodes (void extra, bin_tree_t node);
	48	static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
	49	static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
	50	unsigned int constraint);
	51	static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
	52	static reg_errcode_t calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa,
	53	Idx node, bool root);
	54	static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
	55	static Idx fetch_number (re_string_t input, re_token_t token,
	56	reg_syntax_t syntax);
	57	static int peek_token (re_token_t token, re_string_t input,
	58	reg_syntax_t syntax) internal_function;
	59	static bin_tree_t parse (re_string_t regexp, regex_t *preg,
	60	reg_syntax_t syntax, reg_errcode_t *err);
	61	static bin_tree_t parse_reg_exp (re_string_t regexp, regex_t *preg,
	62	re_token_t *token, reg_syntax_t syntax,
	63	Idx nest, reg_errcode_t *err);
	64	static bin_tree_t parse_branch (re_string_t regexp, regex_t *preg,
	65	re_token_t *token, reg_syntax_t syntax,
	66	Idx nest, reg_errcode_t *err);
	67	static bin_tree_t parse_expression (re_string_t regexp, regex_t *preg,
	68	re_token_t *token, reg_syntax_t syntax,
	69	Idx nest, reg_errcode_t *err);
	70	static bin_tree_t parse_sub_exp (re_string_t regexp, regex_t *preg,
	71	re_token_t *token, reg_syntax_t syntax,
	72	Idx nest, reg_errcode_t *err);
	73	static bin_tree_t parse_dup_op (bin_tree_t dup_elem, re_string_t *regexp,
	74	re_dfa_t dfa, re_token_t token,
	75	reg_syntax_t syntax, reg_errcode_t *err);
	76	static bin_tree_t parse_bracket_exp (re_string_t regexp, re_dfa_t *dfa,
	77	re_token_t *token, reg_syntax_t syntax,
	78	reg_errcode_t *err);
	79	static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
	80	re_string_t *regexp,
	81	re_token_t *token, int token_len,
	82	re_dfa_t *dfa,
	83	reg_syntax_t syntax,
	84	bool accept_hyphen);
	85	static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
	86	re_string_t *regexp,
	87	re_token_t *token);
	88	#ifdef RE_ENABLE_I18N
	89	static reg_errcode_t build_equiv_class (bitset_t sbcset,
	90	re_charset_t *mbcset,
	91	Idx *equiv_class_alloc,
	92	const unsigned char *name);
	93	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
	94	bitset_t sbcset,
	95	re_charset_t *mbcset,
	96	Idx *char_class_alloc,
	97	const char *class_name,
	98	reg_syntax_t syntax);
	99	#else /* not RE_ENABLE_I18N */
	100	static reg_errcode_t build_equiv_class (bitset_t sbcset,
	101	const unsigned char *name);
	102	static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
	103	bitset_t sbcset,
	104	const char *class_name,
	105	reg_syntax_t syntax);
	106	#endif /* not RE_ENABLE_I18N */
	107	static bin_tree_t build_charclass_op (re_dfa_t dfa,
	108	RE_TRANSLATE_TYPE trans,
	109	const char *class_name,
	110	const char *extra,
	111	bool non_match, reg_errcode_t *err);
	112	static bin_tree_t create_tree (re_dfa_t dfa,
	113	bin_tree_t left, bin_tree_t right,
	114	re_token_type_t type);
	115	static bin_tree_t create_token_tree (re_dfa_t dfa,
	116	bin_tree_t left, bin_tree_t right,
	117	const re_token_t *token);
	118	static bin_tree_t duplicate_tree (const bin_tree_t src, re_dfa_t *dfa);
	119	static void free_token (re_token_t *node);
	120	static reg_errcode_t free_tree (void extra, bin_tree_t node);
	121	static reg_errcode_t mark_opt_subexp (void extra, bin_tree_t node);
	122	\f
	123	/* This table gives an error message for each of the error codes listed
	124	in regex.h. Obviously the order here has to be same as there.
	125	POSIX doesn't require that we do anything for REG_NOERROR,
	126	but why not be nice? */
	127
	128	static const char __re_error_msgid[] =
	129	{
	130	#define REG_NOERROR_IDX 0
	131	gettext_noop ("Success") /* REG_NOERROR */
	132	"\0"
	133	#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
	134	gettext_noop ("No match") /* REG_NOMATCH */
	135	"\0"
	136	#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
	137	gettext_noop ("Invalid regular expression") /* REG_BADPAT */
	138	"\0"
	139	#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
	140	gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
	141	"\0"
	142	#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
	143	gettext_noop ("Invalid character class name") /* REG_ECTYPE */
	144	"\0"
	145	#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
	146	gettext_noop ("Trailing backslash") /* REG_EESCAPE */
	147	"\0"
	148	#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
	149	gettext_noop ("Invalid back reference") /* REG_ESUBREG */
	150	"\0"
	151	#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
	152	gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
	153	"\0"
	154	#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
	155	gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
	156	"\0"
	157	#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
	158	gettext_noop ("Unmatched \\{") /* REG_EBRACE */
	159	"\0"
	160	#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
	161	gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
	162	"\0"
	163	#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
	164	gettext_noop ("Invalid range end") /* REG_ERANGE */
	165	"\0"
	166	#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
	167	gettext_noop ("Memory exhausted") /* REG_ESPACE */
	168	"\0"
	169	#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
	170	gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
	171	"\0"
	172	#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
	173	gettext_noop ("Premature end of regular expression") /* REG_EEND */
	174	"\0"
	175	#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
	176	gettext_noop ("Regular expression too big") /* REG_ESIZE */
	177	"\0"
	178	#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
	179	gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
	180	};
	181
	182	static const size_t __re_error_msgid_idx[] =
	183	{
	184	REG_NOERROR_IDX,
	185	REG_NOMATCH_IDX,
	186	REG_BADPAT_IDX,
	187	REG_ECOLLATE_IDX,
	188	REG_ECTYPE_IDX,
	189	REG_EESCAPE_IDX,
	190	REG_ESUBREG_IDX,
	191	REG_EBRACK_IDX,
	192	REG_EPAREN_IDX,
	193	REG_EBRACE_IDX,
	194	REG_BADBR_IDX,
	195	REG_ERANGE_IDX,
	196	REG_ESPACE_IDX,
	197	REG_BADRPT_IDX,
	198	REG_EEND_IDX,
	199	REG_ESIZE_IDX,
	200	REG_ERPAREN_IDX
	201	};
	202	\f
	203	/* Entry points for GNU code. */
	204
	205	/* re_compile_pattern is the GNU regular expression compiler: it
	206	compiles PATTERN (of length LENGTH) and puts the result in BUFP.
	207	Returns 0 if the pattern was valid, otherwise an error string.
	208
	209	Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
	210	are set in BUFP on entry. */
	211
	212	#ifdef _LIBC
	213	const char *
	214	re_compile_pattern (pattern, length, bufp)
	215	const char *pattern;
	216	size_t length;
	217	struct re_pattern_buffer *bufp;
	218	#else /* size_t might promote */
	219	const char *
	220	re_compile_pattern (const char *pattern, size_t length,
	221	struct re_pattern_buffer *bufp)
	222	#endif
	223	{
	224	reg_errcode_t ret;
	225
	226	/* And GNU code determines whether or not to get register information
	227	by passing null for the REGS argument to re_match, etc., not by
	228	setting no_sub, unless RE_NO_SUB is set. */
	229	bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
	230
	231	/* Match anchors at newline. */
	232	bufp->newline_anchor = 1;
	233
	234	ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
	235
	236	if (!ret)
	237	return NULL;
	238	return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
	239	}
	240	#ifdef _LIBC
	241	weak_alias (__re_compile_pattern, re_compile_pattern)
	242	#endif
	243
	244	/* Set by 're_set_syntax' to the current regexp syntax to recognize. Can
	245	also be assigned to arbitrarily: each pattern buffer stores its own
	246	syntax, so it can be changed between regex compilations. */
	247	/* This has no initializer because initialized variables in Emacs
	248	become read-only after dumping. */
	249	reg_syntax_t re_syntax_options;
	250
	251
	252	/* Specify the precise syntax of regexps for compilation. This provides
	253	for compatibility for various utilities which historically have
	254	different, incompatible syntaxes.
	255
	256	The argument SYNTAX is a bit mask comprised of the various bits
	257	defined in regex.h. We return the old syntax. */
	258
	259	reg_syntax_t
	260	re_set_syntax (syntax)
	261	reg_syntax_t syntax;
	262	{
	263	reg_syntax_t ret = re_syntax_options;
	264
	265	re_syntax_options = syntax;
	266	return ret;
	267	}
	268	#ifdef _LIBC
	269	weak_alias (__re_set_syntax, re_set_syntax)
	270	#endif
	271
	272	int
	273	re_compile_fastmap (bufp)
	274	struct re_pattern_buffer *bufp;
	275	{
	276	re_dfa_t *dfa = bufp->buffer;
	277	char *fastmap = bufp->fastmap;
	278
	279	memset (fastmap, '\0', sizeof (char) * SBC_MAX);
	280	re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
	281	if (dfa->init_state != dfa->init_state_word)
	282	re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
	283	if (dfa->init_state != dfa->init_state_nl)
	284	re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
	285	if (dfa->init_state != dfa->init_state_begbuf)
	286	re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
	287	bufp->fastmap_accurate = 1;
	288	return 0;
	289	}
	290	#ifdef _LIBC
	291	weak_alias (__re_compile_fastmap, re_compile_fastmap)
	292	#endif
	293
	294	static inline void
	295	__attribute ((always_inline))
	296	re_set_fastmap (char *fastmap, bool icase, int ch)
	297	{
	298	fastmap[ch] = 1;
	299	if (icase)
	300	fastmap[tolower (ch)] = 1;
	301	}
	302
	303	/* Helper function for re_compile_fastmap.
	304	Compile fastmap for the initial_state INIT_STATE. */
	305
	306	static void
	307	re_compile_fastmap_iter (regex_t bufp, const re_dfastate_t init_state,
	308	char *fastmap)
	309	{
	310	re_dfa_t *dfa = bufp->buffer;
	311	Idx node_cnt;
	312	bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
	313	for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
	314	{
	315	Idx node = init_state->nodes.elems[node_cnt];
	316	re_token_type_t type = dfa->nodes[node].type;
	317
	318	if (type == CHARACTER)
	319	{
	320	re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
	321	#ifdef RE_ENABLE_I18N
	322	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
	323	{
	324	unsigned char buf[MB_LEN_MAX];
	325	unsigned char *p;
	326	wchar_t wc;
	327	mbstate_t state;
	328
	329	p = buf;
	330	*p++ = dfa->nodes[node].opr.c;
	331	while (++node < dfa->nodes_len
	332	&& dfa->nodes[node].type == CHARACTER
	333	&& dfa->nodes[node].mb_partial)
	334	*p++ = dfa->nodes[node].opr.c;
	335	memset (&state, '\0', sizeof (state));
	336	if (__mbrtowc (&wc, (const char *) buf, p - buf,
	337	&state) == p - buf
	338	&& (__wcrtomb ((char *) buf, towlower (wc), &state)
	339	!= (size_t) -1))
	340	re_set_fastmap (fastmap, false, buf[0]);
	341	}
	342	#endif
	343	}
	344	else if (type == SIMPLE_BRACKET)
	345	{
	346	int i, ch;
	347	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
	348	{
	349	int j;
	350	bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
	351	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
	352	if (w & ((bitset_word_t) 1 << j))
	353	re_set_fastmap (fastmap, icase, ch);
	354	}
	355	}
	356	#ifdef RE_ENABLE_I18N
	357	else if (type == COMPLEX_BRACKET)
	358	{
	359	re_charset_t *cset = dfa->nodes[node].opr.mbcset;
	360	Idx i;
	361
	362	# ifdef _LIBC
	363	/* See if we have to try all bytes which start multiple collation
	364	elements.
	365	e.g. In da_DK, we want to catch 'a' since "aa" is a valid
	366	collation element, and don't catch 'b' since 'b' is
	367	the only collation element which starts from 'b' (and
	368	it is caught by SIMPLE_BRACKET). */
	369	if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
	370	&& (cset->ncoll_syms \|\| cset->nranges))
	371	{
	372	const int32_t table = (const int32_t )
	373	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
	374	for (i = 0; i < SBC_MAX; ++i)
	375	if (table[i] < 0)
	376	re_set_fastmap (fastmap, icase, i);
	377	}
	378	# endif /* _LIBC */
	379
	380	/* See if we have to start the match at all multibyte characters,
	381	i.e. where we would not find an invalid sequence. This only
	382	applies to multibyte character sets; for single byte character
	383	sets, the SIMPLE_BRACKET again suffices. */
	384	if (dfa->mb_cur_max > 1
	385	&& (cset->nchar_classes \|\| cset->non_match \|\| cset->nranges
	386	# ifdef _LIBC
	387	\|\| cset->nequiv_classes
	388	# endif /* _LIBC */
	389	))
	390	{
	391	unsigned char c = 0;
	392	do
	393	{
	394	mbstate_t mbs;
	395	memset (&mbs, 0, sizeof (mbs));
	396	if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
	397	re_set_fastmap (fastmap, false, (int) c);
	398	}
	399	while (++c != 0);
	400	}
	401
	402	else
	403	{
	404	/* ... Else catch all bytes which can start the mbchars. */
	405	for (i = 0; i < cset->nmbchars; ++i)
	406	{
	407	char buf[256];
	408	mbstate_t state;
	409	memset (&state, '\0', sizeof (state));
	410	if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
	411	re_set_fastmap (fastmap, icase, (unsigned char ) buf);
	412	if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
	413	{
	414	if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
	415	!= (size_t) -1)
	416	re_set_fastmap (fastmap, false, (unsigned char ) buf);
	417	}
	418	}
	419	}
	420	}
	421	#endif /* RE_ENABLE_I18N */
	422	else if (type == OP_PERIOD
	423	#ifdef RE_ENABLE_I18N
	424	\|\| type == OP_UTF8_PERIOD
	425	#endif /* RE_ENABLE_I18N */
	426	\|\| type == END_OF_RE)
	427	{
	428	memset (fastmap, '\1', sizeof (char) * SBC_MAX);
	429	if (type == END_OF_RE)
	430	bufp->can_be_null = 1;
	431	return;
	432	}
	433	}
	434	}
	435	\f
	436	/* Entry point for POSIX code. */
	437	/* regcomp takes a regular expression as a string and compiles it.
	438
	439	PREG is a regex_t *. We do not expect any fields to be initialized,
	440	since POSIX says we shouldn't. Thus, we set
	441
	442	'buffer' to the compiled pattern;
	443	'used' to the length of the compiled pattern;
	444	'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
	445	REG_EXTENDED bit in CFLAGS is set; otherwise, to
	446	RE_SYNTAX_POSIX_BASIC;
	447	'newline_anchor' to REG_NEWLINE being set in CFLAGS;
	448	'fastmap' to an allocated space for the fastmap;
	449	'fastmap_accurate' to zero;
	450	're_nsub' to the number of subexpressions in PATTERN.
	451
	452	PATTERN is the address of the pattern string.
	453
	454	CFLAGS is a series of bits which affect compilation.
	455
	456	If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
	457	use POSIX basic syntax.
	458
	459	If REG_NEWLINE is set, then . and [^...] don't match newline.
	460	Also, regexec will try a match beginning after every newline.
	461
	462	If REG_ICASE is set, then we considers upper- and lowercase
	463	versions of letters to be equivalent when matching.
	464
	465	If REG_NOSUB is set, then when PREG is passed to regexec, that
	466	routine will report only success or failure, and nothing about the
	467	registers.
	468
	469	It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
	470	the return codes and their meanings.) */
	471
	472	int
	473	regcomp (preg, pattern, cflags)
	474	regex_t *_Restrict_ preg;
	475	const char *_Restrict_ pattern;
	476	int cflags;
	477	{
	478	reg_errcode_t ret;
	479	reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
	480	: RE_SYNTAX_POSIX_BASIC);
	481
	482	preg->buffer = NULL;
	483	preg->allocated = 0;
	484	preg->used = 0;
	485
	486	/* Try to allocate space for the fastmap. */
	487	preg->fastmap = re_malloc (char, SBC_MAX);
	488	if (BE (preg->fastmap == NULL, 0))
	489	return REG_ESPACE;
	490
	491	syntax \|= (cflags & REG_ICASE) ? RE_ICASE : 0;
	492
	493	/* If REG_NEWLINE is set, newlines are treated differently. */
	494	if (cflags & REG_NEWLINE)
	495	{ /* REG_NEWLINE implies neither . nor [^...] match newline. */
	496	syntax &= ~RE_DOT_NEWLINE;
	497	syntax \|= RE_HAT_LISTS_NOT_NEWLINE;
	498	/* It also changes the matching behavior. */
	499	preg->newline_anchor = 1;
	500	}
	501	else
	502	preg->newline_anchor = 0;
	503	preg->no_sub = !!(cflags & REG_NOSUB);
	504	preg->translate = NULL;
	505
	506	ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
	507
	508	/* POSIX doesn't distinguish between an unmatched open-group and an
	509	unmatched close-group: both are REG_EPAREN. */
	510	if (ret == REG_ERPAREN)
	511	ret = REG_EPAREN;
	512
	513	/* We have already checked preg->fastmap != NULL. */
	514	if (BE (ret == REG_NOERROR, 1))
	515	/* Compute the fastmap now, since regexec cannot modify the pattern
	516	buffer. This function never fails in this implementation. */
	517	(void) re_compile_fastmap (preg);
	518	else
	519	{
	520	/* Some error occurred while compiling the expression. */
	521	re_free (preg->fastmap);
	522	preg->fastmap = NULL;
	523	}
	524
	525	return (int) ret;
	526	}
	527	#ifdef _LIBC
	528	weak_alias (__regcomp, regcomp)
	529	#endif
	530
	531	/* Returns a message corresponding to an error code, ERRCODE, returned
	532	from either regcomp or regexec. We don't use PREG here. */
	533
	534	#ifdef _LIBC
	535	size_t
	536	regerror (errcode, preg, errbuf, errbuf_size)
	537	int errcode;
	538	const regex_t *_Restrict_ preg;
	539	char *_Restrict_ errbuf;
	540	size_t errbuf_size;
	541	#else /* size_t might promote */
	542	size_t
	543	regerror (int errcode, const regex_t *_Restrict_ preg,
	544	char *_Restrict_ errbuf, size_t errbuf_size)
	545	#endif
	546	{
	547	const char *msg;
	548	size_t msg_size;
	549
	550	if (BE (errcode < 0
	551	\|\| errcode >= (int) (sizeof (__re_error_msgid_idx)
	552	/ sizeof (__re_error_msgid_idx[0])), 0))
	553	/* Only error codes returned by the rest of the code should be passed
	554	to this routine. If we are given anything else, or if other regex
	555	code generates an invalid error code, then the program has a bug.
	556	Dump core so we can fix it. */
	557	abort ();
	558
	559	msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
	560
	561	msg_size = strlen (msg) + 1; /* Includes the null. */
	562
	563	if (BE (errbuf_size != 0, 1))
	564	{
	565	size_t cpy_size = msg_size;
	566	if (BE (msg_size > errbuf_size, 0))
	567	{
	568	cpy_size = errbuf_size - 1;
	569	errbuf[cpy_size] = '\0';
	570	}
	571	memcpy (errbuf, msg, cpy_size);
	572	}
	573
	574	return msg_size;
	575	}
	576	#ifdef _LIBC
	577	weak_alias (__regerror, regerror)
	578	#endif
	579
	580
	581	#ifdef RE_ENABLE_I18N
	582	/* This static array is used for the map to single-byte characters when
	583	UTF-8 is used. Otherwise we would allocate memory just to initialize
	584	it the same all the time. UTF-8 is the preferred encoding so this is
	585	a worthwhile optimization. */
	586	static const bitset_t utf8_sb_map =
	587	{
	588	/* Set the first 128 bits. */
	589	# ifdef __GNUC__
	590	[0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
	591	# else
	592	# if 4 * BITSET_WORD_BITS < ASCII_CHARS
	593	# error "bitset_word_t is narrower than 32 bits"
	594	# elif 3 * BITSET_WORD_BITS < ASCII_CHARS
	595	BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
	596	# elif 2 * BITSET_WORD_BITS < ASCII_CHARS
	597	BITSET_WORD_MAX, BITSET_WORD_MAX,
	598	# elif 1 * BITSET_WORD_BITS < ASCII_CHARS
	599	BITSET_WORD_MAX,
	600	# endif
	601	(BITSET_WORD_MAX
	602	>> (SBC_MAX % BITSET_WORD_BITS == 0
	603	? 0
	604	: BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
	605	# endif
	606	};
	607	#endif
	608
	609
	610	static void
	611	free_dfa_content (re_dfa_t *dfa)
	612	{
	613	Idx i, j;
	614
	615	if (dfa->nodes)
	616	for (i = 0; i < dfa->nodes_len; ++i)
	617	free_token (dfa->nodes + i);
	618	re_free (dfa->nexts);
	619	for (i = 0; i < dfa->nodes_len; ++i)
	620	{
	621	if (dfa->eclosures != NULL)
	622	re_node_set_free (dfa->eclosures + i);
	623	if (dfa->inveclosures != NULL)
	624	re_node_set_free (dfa->inveclosures + i);
	625	if (dfa->edests != NULL)
	626	re_node_set_free (dfa->edests + i);
	627	}
	628	re_free (dfa->edests);
	629	re_free (dfa->eclosures);
	630	re_free (dfa->inveclosures);
	631	re_free (dfa->nodes);
	632
	633	if (dfa->state_table)
	634	for (i = 0; i <= dfa->state_hash_mask; ++i)
	635	{
	636	struct re_state_table_entry *entry = dfa->state_table + i;
	637	for (j = 0; j < entry->num; ++j)
	638	{
	639	re_dfastate_t *state = entry->array[j];
	640	free_state (state);
	641	}
	642	re_free (entry->array);
	643	}
	644	re_free (dfa->state_table);
	645	#ifdef RE_ENABLE_I18N
	646	if (dfa->sb_char != utf8_sb_map)
	647	re_free (dfa->sb_char);
	648	#endif
	649	re_free (dfa->subexp_map);
	650	#ifdef DEBUG
	651	re_free (dfa->re_str);
	652	#endif
	653
	654	re_free (dfa);
	655	}
	656
	657
	658	/* Free dynamically allocated space used by PREG. */
	659
	660	void
	661	regfree (preg)
	662	regex_t *preg;
	663	{
	664	re_dfa_t *dfa = preg->buffer;
	665	if (BE (dfa != NULL, 1))
	666	free_dfa_content (dfa);
	667	preg->buffer = NULL;
	668	preg->allocated = 0;
	669
	670	re_free (preg->fastmap);
	671	preg->fastmap = NULL;
	672
	673	re_free (preg->translate);
	674	preg->translate = NULL;
	675	}
	676	#ifdef _LIBC
	677	weak_alias (__regfree, regfree)
	678	#endif
	679	\f
	680	/* Entry points compatible with 4.2 BSD regex library. We don't define
	681	them unless specifically requested. */
	682
	683	#if defined _REGEX_RE_COMP \|\| defined _LIBC
	684
	685	/* BSD has one and only one pattern buffer. */
	686	static struct re_pattern_buffer re_comp_buf;
	687
	688	char *
	689	# ifdef _LIBC
	690	/* Make these definitions weak in libc, so POSIX programs can redefine
	691	these names if they don't use our functions, and still use
	692	regcomp/regexec above without link errors. */
	693	weak_function
	694	# endif
	695	re_comp (s)
	696	const char *s;
	697	{
	698	reg_errcode_t ret;
	699	char *fastmap;
	700
	701	if (!s)
	702	{
	703	if (!re_comp_buf.buffer)
	704	return gettext ("No previous regular expression");
	705	return 0;
	706	}
	707
	708	if (re_comp_buf.buffer)
	709	{
	710	fastmap = re_comp_buf.fastmap;
	711	re_comp_buf.fastmap = NULL;
	712	__regfree (&re_comp_buf);
	713	memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
	714	re_comp_buf.fastmap = fastmap;
	715	}
	716
	717	if (re_comp_buf.fastmap == NULL)
	718	{
	719	re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
	720	if (re_comp_buf.fastmap == NULL)
	721	return (char *) gettext (__re_error_msgid
	722	+ __re_error_msgid_idx[(int) REG_ESPACE]);
	723	}
	724
	725	/* Since 're_exec' always passes NULL for the 'regs' argument, we
	726	don't need to initialize the pattern buffer fields which affect it. */
	727
	728	/* Match anchors at newlines. */
	729	re_comp_buf.newline_anchor = 1;
	730
	731	ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
	732
	733	if (!ret)
	734	return NULL;
	735
	736	/* Yes, we're discarding 'const' here if !HAVE_LIBINTL. */
	737	return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
	738	}
	739
	740	#ifdef _LIBC
	741	libc_freeres_fn (free_mem)
	742	{
	743	__regfree (&re_comp_buf);
	744	}
	745	#endif
	746
	747	#endif /* _REGEX_RE_COMP */
	748	\f
	749	/* Internal entry point.
	750	Compile the regular expression PATTERN, whose length is LENGTH.
	751	SYNTAX indicate regular expression's syntax. */
	752
	753	static reg_errcode_t
	754	re_compile_internal (regex_t preg, const char pattern, size_t length,
	755	reg_syntax_t syntax)
	756	{
	757	reg_errcode_t err = REG_NOERROR;
	758	re_dfa_t *dfa;
	759	re_string_t regexp;
	760
	761	/* Initialize the pattern buffer. */
	762	preg->fastmap_accurate = 0;
	763	preg->syntax = syntax;
	764	preg->not_bol = preg->not_eol = 0;
	765	preg->used = 0;
	766	preg->re_nsub = 0;
	767	preg->can_be_null = 0;
	768	preg->regs_allocated = REGS_UNALLOCATED;
	769
	770	/* Initialize the dfa. */
	771	dfa = preg->buffer;
	772	if (BE (preg->allocated < sizeof (re_dfa_t), 0))
	773	{
	774	/* If zero allocated, but buffer is non-null, try to realloc
	775	enough space. This loses if buffer's address is bogus, but
	776	that is the user's responsibility. If ->buffer is NULL this
	777	is a simple allocation. */
	778	dfa = re_realloc (preg->buffer, re_dfa_t, 1);
	779	if (dfa == NULL)
	780	return REG_ESPACE;
	781	preg->allocated = sizeof (re_dfa_t);
	782	preg->buffer = dfa;
	783	}
	784	preg->used = sizeof (re_dfa_t);
	785
	786	err = init_dfa (dfa, length);
	787	if (BE (err != REG_NOERROR, 0))
	788	{
	789	free_dfa_content (dfa);
	790	preg->buffer = NULL;
	791	preg->allocated = 0;
	792	return err;
	793	}
	794	#ifdef DEBUG
	795	/* Note: length+1 will not overflow since it is checked in init_dfa. */
	796	dfa->re_str = re_malloc (char, length + 1);
	797	strncpy (dfa->re_str, pattern, length + 1);
	798	#endif
	799
	800	__libc_lock_init (dfa->lock);
	801
	802	err = re_string_construct (&regexp, pattern, length, preg->translate,
	803	(syntax & RE_ICASE) != 0, dfa);
	804	if (BE (err != REG_NOERROR, 0))
	805	{
	806	re_compile_internal_free_return:
	807	free_workarea_compile (preg);
	808	re_string_destruct (&regexp);
	809	free_dfa_content (dfa);
	810	preg->buffer = NULL;
	811	preg->allocated = 0;
	812	return err;
	813	}
	814
	815	/* Parse the regular expression, and build a structure tree. */
	816	preg->re_nsub = 0;
	817	dfa->str_tree = parse (&regexp, preg, syntax, &err);
	818	if (BE (dfa->str_tree == NULL, 0))
	819	goto re_compile_internal_free_return;
	820
	821	/* Analyze the tree and create the nfa. */
	822	err = analyze (preg);
	823	if (BE (err != REG_NOERROR, 0))
	824	goto re_compile_internal_free_return;
	825
	826	#ifdef RE_ENABLE_I18N
	827	/* If possible, do searching in single byte encoding to speed things up. */
	828	if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
	829	optimize_utf8 (dfa);
	830	#endif
	831
	832	/* Then create the initial state of the dfa. */
	833	err = create_initial_state (dfa);
	834
	835	/* Release work areas. */
	836	free_workarea_compile (preg);
	837	re_string_destruct (&regexp);
	838
	839	if (BE (err != REG_NOERROR, 0))
	840	{
	841	free_dfa_content (dfa);
	842	preg->buffer = NULL;
	843	preg->allocated = 0;
	844	}
	845
	846	return err;
	847	}
	848
	849	/* Initialize DFA. We use the length of the regular expression PAT_LEN
	850	as the initial length of some arrays. */
	851
	852	static reg_errcode_t
	853	init_dfa (re_dfa_t *dfa, size_t pat_len)
	854	{
	855	__re_size_t table_size;
	856	#ifndef _LIBC
	857	const char *codeset_name;
	858	#endif
	859	#ifdef RE_ENABLE_I18N
	860	size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
	861	#else
	862	size_t max_i18n_object_size = 0;
	863	#endif
	864	size_t max_object_size =
	865	MAX (sizeof (struct re_state_table_entry),
	866	MAX (sizeof (re_token_t),
	867	MAX (sizeof (re_node_set),
	868	MAX (sizeof (regmatch_t),
	869	max_i18n_object_size))));
	870
	871	memset (dfa, '\0', sizeof (re_dfa_t));
	872
	873	/* Force allocation of str_tree_storage the first time. */
	874	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
	875
	876	/* Avoid overflows. The extra "/ 2" is for the table_size doubling
	877	calculation below, and for similar doubling calculations
	878	elsewhere. And it's <= rather than <, because some of the
	879	doubling calculations add 1 afterwards. */
	880	if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) / 2 <= pat_len, 0))
	881	return REG_ESPACE;
	882
	883	dfa->nodes_alloc = pat_len + 1;
	884	dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
	885
	886	/* table_size = 2 ^ ceil(log pat_len) */
	887	for (table_size = 1; ; table_size <<= 1)
	888	if (table_size > pat_len)
	889	break;
	890
	891	dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
	892	dfa->state_hash_mask = table_size - 1;
	893
	894	dfa->mb_cur_max = MB_CUR_MAX;
	895	#ifdef _LIBC
	896	if (dfa->mb_cur_max == 6
	897	&& strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
	898	dfa->is_utf8 = 1;
	899	dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
	900	!= 0);
	901	#else
	902	codeset_name = nl_langinfo (CODESET);
	903	if ((codeset_name[0] == 'U' \|\| codeset_name[0] == 'u')
	904	&& (codeset_name[1] == 'T' \|\| codeset_name[1] == 't')
	905	&& (codeset_name[2] == 'F' \|\| codeset_name[2] == 'f')
	906	&& strcmp (codeset_name + 3 + (codeset_name[3] == '-'), "8") == 0)
	907	dfa->is_utf8 = 1;
	908
	909	/* We check exhaustively in the loop below if this charset is a
	910	superset of ASCII. */
	911	dfa->map_notascii = 0;
	912	#endif
	913
	914	#ifdef RE_ENABLE_I18N
	915	if (dfa->mb_cur_max > 1)
	916	{
	917	if (dfa->is_utf8)
	918	dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
	919	else
	920	{
	921	int i, j, ch;
	922
	923	dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
	924	if (BE (dfa->sb_char == NULL, 0))
	925	return REG_ESPACE;
	926
	927	/* Set the bits corresponding to single byte chars. */
	928	for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
	929	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
	930	{
	931	wint_t wch = __btowc (ch);
	932	if (wch != WEOF)
	933	dfa->sb_char[i] \|= (bitset_word_t) 1 << j;
	934	# ifndef _LIBC
	935	if (isascii (ch) && wch != ch)
	936	dfa->map_notascii = 1;
	937	# endif
	938	}
	939	}
	940	}
	941	#endif
	942
	943	if (BE (dfa->nodes == NULL \|\| dfa->state_table == NULL, 0))
	944	return REG_ESPACE;
	945	return REG_NOERROR;
	946	}
	947
	948	/* Initialize WORD_CHAR table, which indicate which character is
	949	"word". In this case "word" means that it is the word construction
	950	character used by some operators like "\<", "\>", etc. */
	951
	952	static void
	953	internal_function
	954	init_word_char (re_dfa_t *dfa)
	955	{
	956	int i = 0;
	957	int j;
	958	int ch = 0;
	959	dfa->word_ops_used = 1;
	960	if (BE (dfa->map_notascii == 0, 1))
	961	{
	962	bitset_word_t bits0 = 0x00000000;
	963	bitset_word_t bits1 = 0x03ff0000;
	964	bitset_word_t bits2 = 0x87fffffe;
	965	bitset_word_t bits3 = 0x07fffffe;
	966	if (BITSET_WORD_BITS == 64)
	967	{
	968	dfa->word_char[0] = bits1 << 31 << 1 \| bits0;
	969	dfa->word_char[1] = bits3 << 31 << 1 \| bits2;
	970	i = 2;
	971	}
	972	else if (BITSET_WORD_BITS == 32)
	973	{
	974	dfa->word_char[0] = bits0;
	975	dfa->word_char[1] = bits1;
	976	dfa->word_char[2] = bits2;
	977	dfa->word_char[3] = bits3;
	978	i = 4;
	979	}
	980	else
	981	goto general_case;
	982	ch = 128;
	983
	984	if (BE (dfa->is_utf8, 1))
	985	{
	986	memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
	987	return;
	988	}
	989	}
	990
	991	general_case:
	992	for (; i < BITSET_WORDS; ++i)
	993	for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
	994	if (isalnum (ch) \|\| ch == '_')
	995	dfa->word_char[i] \|= (bitset_word_t) 1 << j;
	996	}
	997
	998	/* Free the work area which are only used while compiling. */
	999
	1000	static void
	1001	free_workarea_compile (regex_t *preg)
	1002	{
	1003	re_dfa_t *dfa = preg->buffer;
	1004	bin_tree_storage_t storage, next;
	1005	for (storage = dfa->str_tree_storage; storage; storage = next)
	1006	{
	1007	next = storage->next;
	1008	re_free (storage);
	1009	}
	1010	dfa->str_tree_storage = NULL;
	1011	dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
	1012	dfa->str_tree = NULL;
	1013	re_free (dfa->org_indices);
	1014	dfa->org_indices = NULL;
	1015	}
	1016
	1017	/* Create initial states for all contexts. */
	1018
	1019	static reg_errcode_t
	1020	create_initial_state (re_dfa_t *dfa)
	1021	{
	1022	Idx first, i;
	1023	reg_errcode_t err;
	1024	re_node_set init_nodes;
	1025
	1026	/* Initial states have the epsilon closure of the node which is
	1027	the first node of the regular expression. */
	1028	first = dfa->str_tree->first->node_idx;
	1029	dfa->init_node = first;
	1030	err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
	1031	if (BE (err != REG_NOERROR, 0))
	1032	return err;
	1033
	1034	/* The back-references which are in initial states can epsilon transit,
	1035	since in this case all of the subexpressions can be null.
	1036	Then we add epsilon closures of the nodes which are the next nodes of
	1037	the back-references. */
	1038	if (dfa->nbackref > 0)
	1039	for (i = 0; i < init_nodes.nelem; ++i)
	1040	{
	1041	Idx node_idx = init_nodes.elems[i];
	1042	re_token_type_t type = dfa->nodes[node_idx].type;
	1043
	1044	Idx clexp_idx;
	1045	if (type != OP_BACK_REF)
	1046	continue;
	1047	for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
	1048	{
	1049	re_token_t *clexp_node;
	1050	clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
	1051	if (clexp_node->type == OP_CLOSE_SUBEXP
	1052	&& clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
	1053	break;
	1054	}
	1055	if (clexp_idx == init_nodes.nelem)
	1056	continue;
	1057
	1058	if (type == OP_BACK_REF)
	1059	{
	1060	Idx dest_idx = dfa->edests[node_idx].elems[0];
	1061	if (!re_node_set_contains (&init_nodes, dest_idx))
	1062	{
	1063	reg_errcode_t merge_err
	1064	= re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
	1065	if (merge_err != REG_NOERROR)
	1066	return merge_err;
	1067	i = 0;
	1068	}
	1069	}
	1070	}
	1071
	1072	/* It must be the first time to invoke acquire_state. */
	1073	dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
	1074	/* We don't check ERR here, since the initial state must not be NULL. */
	1075	if (BE (dfa->init_state == NULL, 0))
	1076	return err;
	1077	if (dfa->init_state->has_constraint)
	1078	{
	1079	dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
	1080	CONTEXT_WORD);
	1081	dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
	1082	CONTEXT_NEWLINE);
	1083	dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
	1084	&init_nodes,
	1085	CONTEXT_NEWLINE
	1086	\| CONTEXT_BEGBUF);
	1087	if (BE (dfa->init_state_word == NULL \|\| dfa->init_state_nl == NULL
	1088	\|\| dfa->init_state_begbuf == NULL, 0))
	1089	return err;
	1090	}
	1091	else
	1092	dfa->init_state_word = dfa->init_state_nl
	1093	= dfa->init_state_begbuf = dfa->init_state;
	1094
	1095	re_node_set_free (&init_nodes);
	1096	return REG_NOERROR;
	1097	}
	1098	\f
	1099	#ifdef RE_ENABLE_I18N
	1100	/* If it is possible to do searching in single byte encoding instead of UTF-8
	1101	to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
	1102	DFA nodes where needed. */
	1103
	1104	static void
	1105	optimize_utf8 (re_dfa_t *dfa)
	1106	{
	1107	Idx node;
	1108	int i;
	1109	bool mb_chars = false;
	1110	bool has_period = false;
	1111
	1112	for (node = 0; node < dfa->nodes_len; ++node)
	1113	switch (dfa->nodes[node].type)
	1114	{
	1115	case CHARACTER:
	1116	if (dfa->nodes[node].opr.c >= ASCII_CHARS)
	1117	mb_chars = true;
	1118	break;
	1119	case ANCHOR:
	1120	switch (dfa->nodes[node].opr.ctx_type)
	1121	{
	1122	case LINE_FIRST:
	1123	case LINE_LAST:
	1124	case BUF_FIRST:
	1125	case BUF_LAST:
	1126	break;
	1127	default:
	1128	/* Word anchors etc. cannot be handled. It's okay to test
	1129	opr.ctx_type since constraints (for all DFA nodes) are
	1130	created by ORing one or more opr.ctx_type values. */
	1131	return;
	1132	}
	1133	break;
	1134	case OP_PERIOD:
	1135	has_period = true;
	1136	break;
	1137	case OP_BACK_REF:
	1138	case OP_ALT:
	1139	case END_OF_RE:
	1140	case OP_DUP_ASTERISK:
	1141	case OP_OPEN_SUBEXP:
	1142	case OP_CLOSE_SUBEXP:
	1143	break;
	1144	case COMPLEX_BRACKET:
	1145	return;
	1146	case SIMPLE_BRACKET:
	1147	/* Just double check. */
	1148	{
	1149	int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
	1150	? 0
	1151	: BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
	1152	for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
	1153	{
	1154	if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
	1155	return;
	1156	rshift = 0;
	1157	}
	1158	}
	1159	break;
	1160	default:
	1161	abort ();
	1162	}
	1163
	1164	if (mb_chars \|\| has_period)
	1165	for (node = 0; node < dfa->nodes_len; ++node)
	1166	{
	1167	if (dfa->nodes[node].type == CHARACTER
	1168	&& dfa->nodes[node].opr.c >= ASCII_CHARS)
	1169	dfa->nodes[node].mb_partial = 0;
	1170	else if (dfa->nodes[node].type == OP_PERIOD)
	1171	dfa->nodes[node].type = OP_UTF8_PERIOD;
	1172	}
	1173
	1174	/* The search can be in single byte locale. */
	1175	dfa->mb_cur_max = 1;
	1176	dfa->is_utf8 = 0;
	1177	dfa->has_mb_node = dfa->nbackref > 0 \|\| has_period;
	1178	}
	1179	#endif
	1180	\f
	1181	/* Analyze the structure tree, and calculate "first", "next", "edest",
	1182	"eclosure", and "inveclosure". */
	1183
	1184	static reg_errcode_t
	1185	analyze (regex_t *preg)
	1186	{
	1187	re_dfa_t *dfa = preg->buffer;
	1188	reg_errcode_t ret;
	1189
	1190	/* Allocate arrays. */
	1191	dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
	1192	dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
	1193	dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
	1194	dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
	1195	if (BE (dfa->nexts == NULL \|\| dfa->org_indices == NULL \|\| dfa->edests == NULL
	1196	\|\| dfa->eclosures == NULL, 0))
	1197	return REG_ESPACE;
	1198
	1199	dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
	1200	if (dfa->subexp_map != NULL)
	1201	{
	1202	Idx i;
	1203	for (i = 0; i < preg->re_nsub; i++)
	1204	dfa->subexp_map[i] = i;
	1205	preorder (dfa->str_tree, optimize_subexps, dfa);
	1206	for (i = 0; i < preg->re_nsub; i++)
	1207	if (dfa->subexp_map[i] != i)
	1208	break;
	1209	if (i == preg->re_nsub)
	1210	{
	1211	free (dfa->subexp_map);
	1212	dfa->subexp_map = NULL;
	1213	}
	1214	}
	1215
	1216	ret = postorder (dfa->str_tree, lower_subexps, preg);
	1217	if (BE (ret != REG_NOERROR, 0))
	1218	return ret;
	1219	ret = postorder (dfa->str_tree, calc_first, dfa);
	1220	if (BE (ret != REG_NOERROR, 0))
	1221	return ret;
	1222	preorder (dfa->str_tree, calc_next, dfa);
	1223	ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
	1224	if (BE (ret != REG_NOERROR, 0))
	1225	return ret;
	1226	ret = calc_eclosure (dfa);
	1227	if (BE (ret != REG_NOERROR, 0))
	1228	return ret;
	1229
	1230	/* We only need this during the prune_impossible_nodes pass in regexec.c;
	1231	skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
	1232	if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
	1233	\|\| dfa->nbackref)
	1234	{
	1235	dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
	1236	if (BE (dfa->inveclosures == NULL, 0))
	1237	return REG_ESPACE;
	1238	ret = calc_inveclosure (dfa);
	1239	}
	1240
	1241	return ret;
	1242	}
	1243
	1244	/* Our parse trees are very unbalanced, so we cannot use a stack to
	1245	implement parse tree visits. Instead, we use parent pointers and
	1246	some hairy code in these two functions. */
	1247	static reg_errcode_t
	1248	postorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
	1249	void *extra)
	1250	{
	1251	bin_tree_t node, prev;
	1252
	1253	for (node = root; ; )
	1254	{
	1255	/* Descend down the tree, preferably to the left (or to the right
	1256	if that's the only child). */
	1257	while (node->left \|\| node->right)
	1258	if (node->left)
	1259	node = node->left;
	1260	else
	1261	node = node->right;
	1262
	1263	do
	1264	{
	1265	reg_errcode_t err = fn (extra, node);
	1266	if (BE (err != REG_NOERROR, 0))
	1267	return err;
	1268	if (node->parent == NULL)
	1269	return REG_NOERROR;
	1270	prev = node;
	1271	node = node->parent;
	1272	}
	1273	/* Go up while we have a node that is reached from the right. */
	1274	while (node->right == prev \|\| node->right == NULL);
	1275	node = node->right;
	1276	}
	1277	}
	1278
	1279	static reg_errcode_t
	1280	preorder (bin_tree_t root, reg_errcode_t (fn (void , bin_tree_t *)),
	1281	void *extra)
	1282	{
	1283	bin_tree_t *node;
	1284
	1285	for (node = root; ; )
	1286	{
	1287	reg_errcode_t err = fn (extra, node);
	1288	if (BE (err != REG_NOERROR, 0))
	1289	return err;
	1290
	1291	/* Go to the left node, or up and to the right. */
	1292	if (node->left)
	1293	node = node->left;
	1294	else
	1295	{
	1296	bin_tree_t *prev = NULL;
	1297	while (node->right == prev \|\| node->right == NULL)
	1298	{
	1299	prev = node;
	1300	node = node->parent;
	1301	if (!node)
	1302	return REG_NOERROR;
	1303	}
	1304	node = node->right;
	1305	}
	1306	}
	1307	}
	1308
	1309	/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
	1310	re_search_internal to map the inner one's opr.idx to this one's. Adjust
	1311	backreferences as well. Requires a preorder visit. */
	1312	static reg_errcode_t
	1313	optimize_subexps (void extra, bin_tree_t node)
	1314	{
	1315	re_dfa_t dfa = (re_dfa_t ) extra;
	1316
	1317	if (node->token.type == OP_BACK_REF && dfa->subexp_map)
	1318	{
	1319	int idx = node->token.opr.idx;
	1320	node->token.opr.idx = dfa->subexp_map[idx];
	1321	dfa->used_bkref_map \|= 1 << node->token.opr.idx;
	1322	}
	1323
	1324	else if (node->token.type == SUBEXP
	1325	&& node->left && node->left->token.type == SUBEXP)
	1326	{
	1327	Idx other_idx = node->left->token.opr.idx;
	1328
	1329	node->left = node->left->left;
	1330	if (node->left)
	1331	node->left->parent = node;
	1332
	1333	dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
	1334	if (other_idx < BITSET_WORD_BITS)
	1335	dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
	1336	}
	1337
	1338	return REG_NOERROR;
	1339	}
	1340
	1341	/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
	1342	of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
	1343	static reg_errcode_t
	1344	lower_subexps (void extra, bin_tree_t node)
	1345	{
	1346	regex_t preg = (regex_t ) extra;
	1347	reg_errcode_t err = REG_NOERROR;
	1348
	1349	if (node->left && node->left->token.type == SUBEXP)
	1350	{
	1351	node->left = lower_subexp (&err, preg, node->left);
	1352	if (node->left)
	1353	node->left->parent = node;
	1354	}
	1355	if (node->right && node->right->token.type == SUBEXP)
	1356	{
	1357	node->right = lower_subexp (&err, preg, node->right);
	1358	if (node->right)
	1359	node->right->parent = node;
	1360	}
	1361
	1362	return err;
	1363	}
	1364
	1365	static bin_tree_t *
	1366	lower_subexp (reg_errcode_t err, regex_t preg, bin_tree_t *node)
	1367	{
	1368	re_dfa_t *dfa = preg->buffer;
	1369	bin_tree_t *body = node->left;
	1370	bin_tree_t op, cls, tree1, tree;
	1371
	1372	if (preg->no_sub
	1373	/* We do not optimize empty subexpressions, because otherwise we may
	1374	have bad CONCAT nodes with NULL children. This is obviously not
	1375	very common, so we do not lose much. An example that triggers
	1376	this case is the sed "script" //x. */
	1377	&& node->left != NULL
	1378	&& (node->token.opr.idx >= BITSET_WORD_BITS
	1379	\|\| !(dfa->used_bkref_map
	1380	& ((bitset_word_t) 1 << node->token.opr.idx))))
	1381	return node->left;
	1382
	1383	/* Convert the SUBEXP node to the concatenation of an
	1384	OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
	1385	op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
	1386	cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
	1387	tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
	1388	tree = create_tree (dfa, op, tree1, CONCAT);
	1389	if (BE (tree == NULL \|\| tree1 == NULL \|\| op == NULL \|\| cls == NULL, 0))
	1390	{
	1391	*err = REG_ESPACE;
	1392	return NULL;
	1393	}
	1394
	1395	op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
	1396	op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
	1397	return tree;
	1398	}
	1399
	1400	/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
	1401	nodes. Requires a postorder visit. */
	1402	static reg_errcode_t
	1403	calc_first (void extra, bin_tree_t node)
	1404	{
	1405	re_dfa_t dfa = (re_dfa_t ) extra;
	1406	if (node->token.type == CONCAT)
	1407	{
	1408	node->first = node->left->first;
	1409	node->node_idx = node->left->node_idx;
	1410	}
	1411	else
	1412	{
	1413	node->first = node;
	1414	node->node_idx = re_dfa_add_node (dfa, node->token);
	1415	if (BE (node->node_idx == REG_MISSING, 0))
	1416	return REG_ESPACE;
	1417	if (node->token.type == ANCHOR)
	1418	dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
	1419	}
	1420	return REG_NOERROR;
	1421	}
	1422
	1423	/* Pass 2: compute NEXT on the tree. Preorder visit. */
	1424	static reg_errcode_t
	1425	calc_next (void extra, bin_tree_t node)
	1426	{
	1427	switch (node->token.type)
	1428	{
	1429	case OP_DUP_ASTERISK:
	1430	node->left->next = node;
	1431	break;
	1432	case CONCAT:
	1433	node->left->next = node->right->first;
	1434	node->right->next = node->next;
	1435	break;
	1436	default:
	1437	if (node->left)
	1438	node->left->next = node->next;
	1439	if (node->right)
	1440	node->right->next = node->next;
	1441	break;
	1442	}
	1443	return REG_NOERROR;
	1444	}
	1445
	1446	/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
	1447	static reg_errcode_t
	1448	link_nfa_nodes (void extra, bin_tree_t node)
	1449	{
	1450	re_dfa_t dfa = (re_dfa_t ) extra;
	1451	Idx idx = node->node_idx;
	1452	reg_errcode_t err = REG_NOERROR;
	1453
	1454	switch (node->token.type)
	1455	{
	1456	case CONCAT:
	1457	break;
	1458
	1459	case END_OF_RE:
	1460	assert (node->next == NULL);
	1461	break;
	1462
	1463	case OP_DUP_ASTERISK:
	1464	case OP_ALT:
	1465	{
	1466	Idx left, right;
	1467	dfa->has_plural_match = 1;
	1468	if (node->left != NULL)
	1469	left = node->left->first->node_idx;
	1470	else
	1471	left = node->next->node_idx;
	1472	if (node->right != NULL)
	1473	right = node->right->first->node_idx;
	1474	else
	1475	right = node->next->node_idx;
	1476	assert (REG_VALID_INDEX (left));
	1477	assert (REG_VALID_INDEX (right));
	1478	err = re_node_set_init_2 (dfa->edests + idx, left, right);
	1479	}
	1480	break;
	1481
	1482	case ANCHOR:
	1483	case OP_OPEN_SUBEXP:
	1484	case OP_CLOSE_SUBEXP:
	1485	err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
	1486	break;
	1487
	1488	case OP_BACK_REF:
	1489	dfa->nexts[idx] = node->next->node_idx;
	1490	if (node->token.type == OP_BACK_REF)
	1491	err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
	1492	break;
	1493
	1494	default:
	1495	assert (!IS_EPSILON_NODE (node->token.type));
	1496	dfa->nexts[idx] = node->next->node_idx;
	1497	break;
	1498	}
	1499
	1500	return err;
	1501	}
	1502
	1503	/* Duplicate the epsilon closure of the node ROOT_NODE.
	1504	Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
	1505	to their own constraint. */
	1506
	1507	static reg_errcode_t
	1508	internal_function
	1509	duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
	1510	Idx root_node, unsigned int init_constraint)
	1511	{
	1512	Idx org_node, clone_node;
	1513	bool ok;
	1514	unsigned int constraint = init_constraint;
	1515	for (org_node = top_org_node, clone_node = top_clone_node;;)
	1516	{
	1517	Idx org_dest, clone_dest;
	1518	if (dfa->nodes[org_node].type == OP_BACK_REF)
	1519	{
	1520	/* If the back reference epsilon-transit, its destination must
	1521	also have the constraint. Then duplicate the epsilon closure
	1522	of the destination of the back reference, and store it in
	1523	edests of the back reference. */
	1524	org_dest = dfa->nexts[org_node];
	1525	re_node_set_empty (dfa->edests + clone_node);
	1526	clone_dest = duplicate_node (dfa, org_dest, constraint);
	1527	if (BE (clone_dest == REG_MISSING, 0))
	1528	return REG_ESPACE;
	1529	dfa->nexts[clone_node] = dfa->nexts[org_node];
	1530	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1531	if (BE (! ok, 0))
	1532	return REG_ESPACE;
	1533	}
	1534	else if (dfa->edests[org_node].nelem == 0)
	1535	{
	1536	/* In case of the node can't epsilon-transit, don't duplicate the
	1537	destination and store the original destination as the
	1538	destination of the node. */
	1539	dfa->nexts[clone_node] = dfa->nexts[org_node];
	1540	break;
	1541	}
	1542	else if (dfa->edests[org_node].nelem == 1)
	1543	{
	1544	/* In case of the node can epsilon-transit, and it has only one
	1545	destination. */
	1546	org_dest = dfa->edests[org_node].elems[0];
	1547	re_node_set_empty (dfa->edests + clone_node);
	1548	/* If the node is root_node itself, it means the epsilon closure
	1549	has a loop. Then tie it to the destination of the root_node. */
	1550	if (org_node == root_node && clone_node != org_node)
	1551	{
	1552	ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
	1553	if (BE (! ok, 0))
	1554	return REG_ESPACE;
	1555	break;
	1556	}
	1557	/* In case the node has another constraint, append it. */
	1558	constraint \|= dfa->nodes[org_node].constraint;
	1559	clone_dest = duplicate_node (dfa, org_dest, constraint);
	1560	if (BE (clone_dest == REG_MISSING, 0))
	1561	return REG_ESPACE;
	1562	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1563	if (BE (! ok, 0))
	1564	return REG_ESPACE;
	1565	}
	1566	else /* dfa->edests[org_node].nelem == 2 */
	1567	{
	1568	/* In case of the node can epsilon-transit, and it has two
	1569	destinations. In the bin_tree_t and DFA, that's '\|' and ''. /
	1570	org_dest = dfa->edests[org_node].elems[0];
	1571	re_node_set_empty (dfa->edests + clone_node);
	1572	/* Search for a duplicated node which satisfies the constraint. */
	1573	clone_dest = search_duplicated_node (dfa, org_dest, constraint);
	1574	if (clone_dest == REG_MISSING)
	1575	{
	1576	/* There is no such duplicated node, create a new one. */
	1577	reg_errcode_t err;
	1578	clone_dest = duplicate_node (dfa, org_dest, constraint);
	1579	if (BE (clone_dest == REG_MISSING, 0))
	1580	return REG_ESPACE;
	1581	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1582	if (BE (! ok, 0))
	1583	return REG_ESPACE;
	1584	err = duplicate_node_closure (dfa, org_dest, clone_dest,
	1585	root_node, constraint);
	1586	if (BE (err != REG_NOERROR, 0))
	1587	return err;
	1588	}
	1589	else
	1590	{
	1591	/* There is a duplicated node which satisfies the constraint,
	1592	use it to avoid infinite loop. */
	1593	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1594	if (BE (! ok, 0))
	1595	return REG_ESPACE;
	1596	}
	1597
	1598	org_dest = dfa->edests[org_node].elems[1];
	1599	clone_dest = duplicate_node (dfa, org_dest, constraint);
	1600	if (BE (clone_dest == REG_MISSING, 0))
	1601	return REG_ESPACE;
	1602	ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
	1603	if (BE (! ok, 0))
	1604	return REG_ESPACE;
	1605	}
	1606	org_node = org_dest;
	1607	clone_node = clone_dest;
	1608	}
	1609	return REG_NOERROR;
	1610	}
	1611
	1612	/* Search for a node which is duplicated from the node ORG_NODE, and
	1613	satisfies the constraint CONSTRAINT. */
	1614
	1615	static Idx
	1616	search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
	1617	unsigned int constraint)
	1618	{
	1619	Idx idx;
	1620	for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
	1621	{
	1622	if (org_node == dfa->org_indices[idx]
	1623	&& constraint == dfa->nodes[idx].constraint)
	1624	return idx; /* Found. */
	1625	}
	1626	return REG_MISSING; /* Not found. */
	1627	}
	1628
	1629	/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
	1630	Return the index of the new node, or REG_MISSING if insufficient storage is
	1631	available. */
	1632
	1633	static Idx
	1634	duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
	1635	{
	1636	Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
	1637	if (BE (dup_idx != REG_MISSING, 1))
	1638	{
	1639	dfa->nodes[dup_idx].constraint = constraint;
	1640	dfa->nodes[dup_idx].constraint \|= dfa->nodes[org_idx].constraint;
	1641	dfa->nodes[dup_idx].duplicated = 1;
	1642
	1643	/* Store the index of the original node. */
	1644	dfa->org_indices[dup_idx] = org_idx;
	1645	}
	1646	return dup_idx;
	1647	}
	1648
	1649	static reg_errcode_t
	1650	calc_inveclosure (re_dfa_t *dfa)
	1651	{
	1652	Idx src, idx;
	1653	bool ok;
	1654	for (idx = 0; idx < dfa->nodes_len; ++idx)
	1655	re_node_set_init_empty (dfa->inveclosures + idx);
	1656
	1657	for (src = 0; src < dfa->nodes_len; ++src)
	1658	{
	1659	Idx *elems = dfa->eclosures[src].elems;
	1660	for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
	1661	{
	1662	ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
	1663	if (BE (! ok, 0))
	1664	return REG_ESPACE;
	1665	}
	1666	}
	1667
	1668	return REG_NOERROR;
	1669	}
	1670
	1671	/* Calculate "eclosure" for all the node in DFA. */
	1672
	1673	static reg_errcode_t
	1674	calc_eclosure (re_dfa_t *dfa)
	1675	{
	1676	Idx node_idx;
	1677	bool incomplete;
	1678	#ifdef DEBUG
	1679	assert (dfa->nodes_len > 0);
	1680	#endif
	1681	incomplete = false;
	1682	/* For each nodes, calculate epsilon closure. */
	1683	for (node_idx = 0; ; ++node_idx)
	1684	{
	1685	reg_errcode_t err;
	1686	re_node_set eclosure_elem;
	1687	if (node_idx == dfa->nodes_len)
	1688	{
	1689	if (!incomplete)
	1690	break;
	1691	incomplete = false;
	1692	node_idx = 0;
	1693	}
	1694
	1695	#ifdef DEBUG
	1696	assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
	1697	#endif
	1698
	1699	/* If we have already calculated, skip it. */
	1700	if (dfa->eclosures[node_idx].nelem != 0)
	1701	continue;
	1702	/* Calculate epsilon closure of 'node_idx'. */
	1703	err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
	1704	if (BE (err != REG_NOERROR, 0))
	1705	return err;
	1706
	1707	if (dfa->eclosures[node_idx].nelem == 0)
	1708	{
	1709	incomplete = true;
	1710	re_node_set_free (&eclosure_elem);
	1711	}
	1712	}
	1713	return REG_NOERROR;
	1714	}
	1715
	1716	/* Calculate epsilon closure of NODE. */
	1717
	1718	static reg_errcode_t
	1719	calc_eclosure_iter (re_node_set new_set, re_dfa_t dfa, Idx node, bool root)
	1720	{
	1721	reg_errcode_t err;
	1722	Idx i;
	1723	re_node_set eclosure;
	1724	bool ok;
	1725	bool incomplete = false;
	1726	err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
	1727	if (BE (err != REG_NOERROR, 0))
	1728	return err;
	1729
	1730	/* This indicates that we are calculating this node now.
	1731	We reference this value to avoid infinite loop. */
	1732	dfa->eclosures[node].nelem = REG_MISSING;
	1733
	1734	/* If the current node has constraints, duplicate all nodes
	1735	since they must inherit the constraints. */
	1736	if (dfa->nodes[node].constraint
	1737	&& dfa->edests[node].nelem
	1738	&& !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
	1739	{
	1740	err = duplicate_node_closure (dfa, node, node, node,
	1741	dfa->nodes[node].constraint);
	1742	if (BE (err != REG_NOERROR, 0))
	1743	return err;
	1744	}
	1745
	1746	/* Expand each epsilon destination nodes. */
	1747	if (IS_EPSILON_NODE(dfa->nodes[node].type))
	1748	for (i = 0; i < dfa->edests[node].nelem; ++i)
	1749	{
	1750	re_node_set eclosure_elem;
	1751	Idx edest = dfa->edests[node].elems[i];
	1752	/* If calculating the epsilon closure of 'edest' is in progress,
	1753	return intermediate result. */
	1754	if (dfa->eclosures[edest].nelem == REG_MISSING)
	1755	{
	1756	incomplete = true;
	1757	continue;
	1758	}
	1759	/* If we haven't calculated the epsilon closure of 'edest' yet,
	1760	calculate now. Otherwise use calculated epsilon closure. */
	1761	if (dfa->eclosures[edest].nelem == 0)
	1762	{
	1763	err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
	1764	if (BE (err != REG_NOERROR, 0))
	1765	return err;
	1766	}
	1767	else
	1768	eclosure_elem = dfa->eclosures[edest];
	1769	/* Merge the epsilon closure of 'edest'. */
	1770	err = re_node_set_merge (&eclosure, &eclosure_elem);
	1771	if (BE (err != REG_NOERROR, 0))
	1772	return err;
	1773	/* If the epsilon closure of 'edest' is incomplete,
	1774	the epsilon closure of this node is also incomplete. */
	1775	if (dfa->eclosures[edest].nelem == 0)
	1776	{
	1777	incomplete = true;
	1778	re_node_set_free (&eclosure_elem);
	1779	}
	1780	}
	1781
	1782	/* An epsilon closure includes itself. */
	1783	ok = re_node_set_insert (&eclosure, node);
	1784	if (BE (! ok, 0))
	1785	return REG_ESPACE;
	1786	if (incomplete && !root)
	1787	dfa->eclosures[node].nelem = 0;
	1788	else
	1789	dfa->eclosures[node] = eclosure;
	1790	*new_set = eclosure;
	1791	return REG_NOERROR;
	1792	}
	1793	\f
	1794	/* Functions for token which are used in the parser. */
	1795
	1796	/* Fetch a token from INPUT.
	1797	We must not use this function inside bracket expressions. */
	1798
	1799	static void
	1800	internal_function
	1801	fetch_token (re_token_t result, re_string_t input, reg_syntax_t syntax)
	1802	{
	1803	re_string_skip_bytes (input, peek_token (result, input, syntax));
	1804	}
	1805
	1806	/* Peek a token from INPUT, and return the length of the token.
	1807	We must not use this function inside bracket expressions. */
	1808
	1809	static int
	1810	internal_function
	1811	peek_token (re_token_t token, re_string_t input, reg_syntax_t syntax)
	1812	{
	1813	unsigned char c;
	1814
	1815	if (re_string_eoi (input))
	1816	{
	1817	token->type = END_OF_RE;
	1818	return 0;
	1819	}
	1820
	1821	c = re_string_peek_byte (input, 0);
	1822	token->opr.c = c;
	1823
	1824	token->word_char = 0;
	1825	#ifdef RE_ENABLE_I18N
	1826	token->mb_partial = 0;
	1827	if (input->mb_cur_max > 1 &&
	1828	!re_string_first_byte (input, re_string_cur_idx (input)))
	1829	{
	1830	token->type = CHARACTER;
	1831	token->mb_partial = 1;
	1832	return 1;
	1833	}
	1834	#endif
	1835	if (c == '\\')
	1836	{
	1837	unsigned char c2;
	1838	if (re_string_cur_idx (input) + 1 >= re_string_length (input))
	1839	{
	1840	token->type = BACK_SLASH;
	1841	return 1;
	1842	}
	1843
	1844	c2 = re_string_peek_byte_case (input, 1);
	1845	token->opr.c = c2;
	1846	token->type = CHARACTER;
	1847	#ifdef RE_ENABLE_I18N
	1848	if (input->mb_cur_max > 1)
	1849	{
	1850	wint_t wc = re_string_wchar_at (input,
	1851	re_string_cur_idx (input) + 1);
	1852	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
	1853	}
	1854	else
	1855	#endif
	1856	token->word_char = IS_WORD_CHAR (c2) != 0;
	1857
	1858	switch (c2)
	1859	{
	1860	case '\|':
	1861	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
	1862	token->type = OP_ALT;
	1863	break;
	1864	case '1': case '2': case '3': case '4': case '5':
	1865	case '6': case '7': case '8': case '9':
	1866	if (!(syntax & RE_NO_BK_REFS))
	1867	{
	1868	token->type = OP_BACK_REF;
	1869	token->opr.idx = c2 - '1';
	1870	}
	1871	break;
	1872	case '<':
	1873	if (!(syntax & RE_NO_GNU_OPS))
	1874	{
	1875	token->type = ANCHOR;
	1876	token->opr.ctx_type = WORD_FIRST;
	1877	}
	1878	break;
	1879	case '>':
	1880	if (!(syntax & RE_NO_GNU_OPS))
	1881	{
	1882	token->type = ANCHOR;
	1883	token->opr.ctx_type = WORD_LAST;
	1884	}
	1885	break;
	1886	case 'b':
	1887	if (!(syntax & RE_NO_GNU_OPS))
	1888	{
	1889	token->type = ANCHOR;
	1890	token->opr.ctx_type = WORD_DELIM;
	1891	}
	1892	break;
	1893	case 'B':
	1894	if (!(syntax & RE_NO_GNU_OPS))
	1895	{
	1896	token->type = ANCHOR;
	1897	token->opr.ctx_type = NOT_WORD_DELIM;
	1898	}
	1899	break;
	1900	case 'w':
	1901	if (!(syntax & RE_NO_GNU_OPS))
	1902	token->type = OP_WORD;
	1903	break;
	1904	case 'W':
	1905	if (!(syntax & RE_NO_GNU_OPS))
	1906	token->type = OP_NOTWORD;
	1907	break;
	1908	case 's':
	1909	if (!(syntax & RE_NO_GNU_OPS))
	1910	token->type = OP_SPACE;
	1911	break;
	1912	case 'S':
	1913	if (!(syntax & RE_NO_GNU_OPS))
	1914	token->type = OP_NOTSPACE;
	1915	break;
	1916	case '`':
	1917	if (!(syntax & RE_NO_GNU_OPS))
	1918	{
	1919	token->type = ANCHOR;
	1920	token->opr.ctx_type = BUF_FIRST;
	1921	}
	1922	break;
	1923	case '\'':
	1924	if (!(syntax & RE_NO_GNU_OPS))
	1925	{
	1926	token->type = ANCHOR;
	1927	token->opr.ctx_type = BUF_LAST;
	1928	}
	1929	break;
	1930	case '(':
	1931	if (!(syntax & RE_NO_BK_PARENS))
	1932	token->type = OP_OPEN_SUBEXP;
	1933	break;
	1934	case ')':
	1935	if (!(syntax & RE_NO_BK_PARENS))
	1936	token->type = OP_CLOSE_SUBEXP;
	1937	break;
	1938	case '+':
	1939	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
	1940	token->type = OP_DUP_PLUS;
	1941	break;
	1942	case '?':
	1943	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
	1944	token->type = OP_DUP_QUESTION;
	1945	break;
	1946	case '{':
	1947	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
	1948	token->type = OP_OPEN_DUP_NUM;
	1949	break;
	1950	case '}':
	1951	if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
	1952	token->type = OP_CLOSE_DUP_NUM;
	1953	break;
	1954	default:
	1955	break;
	1956	}
	1957	return 2;
	1958	}
	1959
	1960	token->type = CHARACTER;
	1961	#ifdef RE_ENABLE_I18N
	1962	if (input->mb_cur_max > 1)
	1963	{
	1964	wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
	1965	token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
	1966	}
	1967	else
	1968	#endif
	1969	token->word_char = IS_WORD_CHAR (token->opr.c);
	1970
	1971	switch (c)
	1972	{
	1973	case '\n':
	1974	if (syntax & RE_NEWLINE_ALT)
	1975	token->type = OP_ALT;
	1976	break;
	1977	case '\|':
	1978	if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
	1979	token->type = OP_ALT;
	1980	break;
	1981	case '*':
	1982	token->type = OP_DUP_ASTERISK;
	1983	break;
	1984	case '+':
	1985	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
	1986	token->type = OP_DUP_PLUS;
	1987	break;
	1988	case '?':
	1989	if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
	1990	token->type = OP_DUP_QUESTION;
	1991	break;
	1992	case '{':
	1993	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
	1994	token->type = OP_OPEN_DUP_NUM;
	1995	break;
	1996	case '}':
	1997	if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
	1998	token->type = OP_CLOSE_DUP_NUM;
	1999	break;
	2000	case '(':
	2001	if (syntax & RE_NO_BK_PARENS)
	2002	token->type = OP_OPEN_SUBEXP;
	2003	break;
	2004	case ')':
	2005	if (syntax & RE_NO_BK_PARENS)
	2006	token->type = OP_CLOSE_SUBEXP;
	2007	break;
	2008	case '[':
	2009	token->type = OP_OPEN_BRACKET;
	2010	break;
	2011	case '.':
	2012	token->type = OP_PERIOD;
	2013	break;
	2014	case '^':
	2015	if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS \| RE_CARET_ANCHORS_HERE)) &&
	2016	re_string_cur_idx (input) != 0)
	2017	{
	2018	char prev = re_string_peek_byte (input, -1);
	2019	if (!(syntax & RE_NEWLINE_ALT) \|\| prev != '\n')
	2020	break;
	2021	}
	2022	token->type = ANCHOR;
	2023	token->opr.ctx_type = LINE_FIRST;
	2024	break;
	2025	case '$':
	2026	if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
	2027	re_string_cur_idx (input) + 1 != re_string_length (input))
	2028	{
	2029	re_token_t next;
	2030	re_string_skip_bytes (input, 1);
	2031	peek_token (&next, input, syntax);
	2032	re_string_skip_bytes (input, -1);
	2033	if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
	2034	break;
	2035	}
	2036	token->type = ANCHOR;
	2037	token->opr.ctx_type = LINE_LAST;
	2038	break;
	2039	default:
	2040	break;
	2041	}
	2042	return 1;
	2043	}
	2044
	2045	/* Peek a token from INPUT, and return the length of the token.
	2046	We must not use this function out of bracket expressions. */
	2047
	2048	static int
	2049	internal_function
	2050	peek_token_bracket (re_token_t token, re_string_t input, reg_syntax_t syntax)
	2051	{
	2052	unsigned char c;
	2053	if (re_string_eoi (input))
	2054	{
	2055	token->type = END_OF_RE;
	2056	return 0;
	2057	}
	2058	c = re_string_peek_byte (input, 0);
	2059	token->opr.c = c;
	2060
	2061	#ifdef RE_ENABLE_I18N
	2062	if (input->mb_cur_max > 1 &&
	2063	!re_string_first_byte (input, re_string_cur_idx (input)))
	2064	{
	2065	token->type = CHARACTER;
	2066	return 1;
	2067	}
	2068	#endif /* RE_ENABLE_I18N */
	2069
	2070	if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
	2071	&& re_string_cur_idx (input) + 1 < re_string_length (input))
	2072	{
	2073	/* In this case, '\' escape a character. */
	2074	unsigned char c2;
	2075	re_string_skip_bytes (input, 1);
	2076	c2 = re_string_peek_byte (input, 0);
	2077	token->opr.c = c2;
	2078	token->type = CHARACTER;
	2079	return 1;
	2080	}
	2081	if (c == '[') /* '[' is a special char in a bracket exps. */
	2082	{
	2083	unsigned char c2;
	2084	int token_len;
	2085	if (re_string_cur_idx (input) + 1 < re_string_length (input))
	2086	c2 = re_string_peek_byte (input, 1);
	2087	else
	2088	c2 = 0;
	2089	token->opr.c = c2;
	2090	token_len = 2;
	2091	switch (c2)
	2092	{
	2093	case '.':
	2094	token->type = OP_OPEN_COLL_ELEM;
	2095	break;
	2096	case '=':
	2097	token->type = OP_OPEN_EQUIV_CLASS;
	2098	break;
	2099	case ':':
	2100	if (syntax & RE_CHAR_CLASSES)
	2101	{
	2102	token->type = OP_OPEN_CHAR_CLASS;
	2103	break;
	2104	}
	2105	/* else fall through. */
	2106	default:
	2107	token->type = CHARACTER;
	2108	token->opr.c = c;
	2109	token_len = 1;
	2110	break;
	2111	}
	2112	return token_len;
	2113	}
	2114	switch (c)
	2115	{
	2116	case '-':
	2117	token->type = OP_CHARSET_RANGE;
	2118	break;
	2119	case ']':
	2120	token->type = OP_CLOSE_BRACKET;
	2121	break;
	2122	case '^':
	2123	token->type = OP_NON_MATCH_LIST;
	2124	break;
	2125	default:
	2126	token->type = CHARACTER;
	2127	}
	2128	return 1;
	2129	}
	2130	\f
	2131	/* Functions for parser. */
	2132
	2133	/* Entry point of the parser.
	2134	Parse the regular expression REGEXP and return the structure tree.
	2135	If an error occurs, ERR is set by error code, and return NULL.
	2136	This function build the following tree, from regular expression <reg_exp>:
	2137	CAT
	2138	/ \
	2139	/ \
	2140	<reg_exp> EOR
	2141
	2142	CAT means concatenation.
	2143	EOR means end of regular expression. */
	2144
	2145	static bin_tree_t *
	2146	parse (re_string_t regexp, regex_t preg, reg_syntax_t syntax,
	2147	reg_errcode_t *err)
	2148	{
	2149	re_dfa_t *dfa = preg->buffer;
	2150	bin_tree_t tree, eor, *root;
	2151	re_token_t current_token;
	2152	dfa->syntax = syntax;
	2153	fetch_token (&current_token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
	2154	tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
	2155	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2156	return NULL;
	2157	eor = create_tree (dfa, NULL, NULL, END_OF_RE);
	2158	if (tree != NULL)
	2159	root = create_tree (dfa, tree, eor, CONCAT);
	2160	else
	2161	root = eor;
	2162	if (BE (eor == NULL \|\| root == NULL, 0))
	2163	{
	2164	*err = REG_ESPACE;
	2165	return NULL;
	2166	}
	2167	return root;
	2168	}
	2169
	2170	/* This function build the following tree, from regular expression
	2171	<branch1>\|<branch2>:
	2172	ALT
	2173	/ \
	2174	/ \
	2175	<branch1> <branch2>
	2176
	2177	ALT means alternative, which represents the operator '\|'. */
	2178
	2179	static bin_tree_t *
	2180	parse_reg_exp (re_string_t regexp, regex_t preg, re_token_t *token,
	2181	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
	2182	{
	2183	re_dfa_t *dfa = preg->buffer;
	2184	bin_tree_t tree, branch = NULL;
	2185	tree = parse_branch (regexp, preg, token, syntax, nest, err);
	2186	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2187	return NULL;
	2188
	2189	while (token->type == OP_ALT)
	2190	{
	2191	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
	2192	if (token->type != OP_ALT && token->type != END_OF_RE
	2193	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
	2194	{
	2195	branch = parse_branch (regexp, preg, token, syntax, nest, err);
	2196	if (BE (*err != REG_NOERROR && branch == NULL, 0))
	2197	return NULL;
	2198	}
	2199	else
	2200	branch = NULL;
	2201	tree = create_tree (dfa, tree, branch, OP_ALT);
	2202	if (BE (tree == NULL, 0))
	2203	{
	2204	*err = REG_ESPACE;
	2205	return NULL;
	2206	}
	2207	}
	2208	return tree;
	2209	}
	2210
	2211	/* This function build the following tree, from regular expression
	2212	<exp1><exp2>:
	2213	CAT
	2214	/ \
	2215	/ \
	2216	<exp1> <exp2>
	2217
	2218	CAT means concatenation. */
	2219
	2220	static bin_tree_t *
	2221	parse_branch (re_string_t regexp, regex_t preg, re_token_t *token,
	2222	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
	2223	{
	2224	bin_tree_t tree, expr;
	2225	re_dfa_t *dfa = preg->buffer;
	2226	tree = parse_expression (regexp, preg, token, syntax, nest, err);
	2227	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2228	return NULL;
	2229
	2230	while (token->type != OP_ALT && token->type != END_OF_RE
	2231	&& (nest == 0 \|\| token->type != OP_CLOSE_SUBEXP))
	2232	{
	2233	expr = parse_expression (regexp, preg, token, syntax, nest, err);
	2234	if (BE (*err != REG_NOERROR && expr == NULL, 0))
	2235	{
	2236	if (tree != NULL)
	2237	postorder (tree, free_tree, NULL);
	2238	return NULL;
	2239	}
	2240	if (tree != NULL && expr != NULL)
	2241	{
	2242	bin_tree_t *newtree = create_tree (dfa, tree, expr, CONCAT);
	2243	if (newtree == NULL)
	2244	{
	2245	postorder (expr, free_tree, NULL);
	2246	postorder (tree, free_tree, NULL);
	2247	*err = REG_ESPACE;
	2248	return NULL;
	2249	}
	2250	tree = newtree;
	2251	}
	2252	else if (tree == NULL)
	2253	tree = expr;
	2254	/* Otherwise expr == NULL, we don't need to create new tree. */
	2255	}
	2256	return tree;
	2257	}
	2258
	2259	/* This function build the following tree, from regular expression a*:
	2260	*
	2261	\|
	2262	a
	2263	*/
	2264
	2265	static bin_tree_t *
	2266	parse_expression (re_string_t regexp, regex_t preg, re_token_t *token,
	2267	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
	2268	{
	2269	re_dfa_t *dfa = preg->buffer;
	2270	bin_tree_t *tree;
	2271	switch (token->type)
	2272	{
	2273	case CHARACTER:
	2274	tree = create_token_tree (dfa, NULL, NULL, token);
	2275	if (BE (tree == NULL, 0))
	2276	{
	2277	*err = REG_ESPACE;
	2278	return NULL;
	2279	}
	2280	#ifdef RE_ENABLE_I18N
	2281	if (dfa->mb_cur_max > 1)
	2282	{
	2283	while (!re_string_eoi (regexp)
	2284	&& !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
	2285	{
	2286	bin_tree_t *mbc_remain;
	2287	fetch_token (token, regexp, syntax);
	2288	mbc_remain = create_token_tree (dfa, NULL, NULL, token);
	2289	tree = create_tree (dfa, tree, mbc_remain, CONCAT);
	2290	if (BE (mbc_remain == NULL \|\| tree == NULL, 0))
	2291	{
	2292	*err = REG_ESPACE;
	2293	return NULL;
	2294	}
	2295	}
	2296	}
	2297	#endif
	2298	break;
	2299	case OP_OPEN_SUBEXP:
	2300	tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
	2301	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2302	return NULL;
	2303	break;
	2304	case OP_OPEN_BRACKET:
	2305	tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
	2306	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2307	return NULL;
	2308	break;
	2309	case OP_BACK_REF:
	2310	if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
	2311	{
	2312	*err = REG_ESUBREG;
	2313	return NULL;
	2314	}
	2315	dfa->used_bkref_map \|= 1 << token->opr.idx;
	2316	tree = create_token_tree (dfa, NULL, NULL, token);
	2317	if (BE (tree == NULL, 0))
	2318	{
	2319	*err = REG_ESPACE;
	2320	return NULL;
	2321	}
	2322	++dfa->nbackref;
	2323	dfa->has_mb_node = 1;
	2324	break;
	2325	case OP_OPEN_DUP_NUM:
	2326	if (syntax & RE_CONTEXT_INVALID_DUP)
	2327	{
	2328	*err = REG_BADRPT;
	2329	return NULL;
	2330	}
	2331	/* FALLTHROUGH */
	2332	case OP_DUP_ASTERISK:
	2333	case OP_DUP_PLUS:
	2334	case OP_DUP_QUESTION:
	2335	if (syntax & RE_CONTEXT_INVALID_OPS)
	2336	{
	2337	*err = REG_BADRPT;
	2338	return NULL;
	2339	}
	2340	else if (syntax & RE_CONTEXT_INDEP_OPS)
	2341	{
	2342	fetch_token (token, regexp, syntax);
	2343	return parse_expression (regexp, preg, token, syntax, nest, err);
	2344	}
	2345	/* else fall through */
	2346	case OP_CLOSE_SUBEXP:
	2347	if ((token->type == OP_CLOSE_SUBEXP) &&
	2348	!(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
	2349	{
	2350	*err = REG_ERPAREN;
	2351	return NULL;
	2352	}
	2353	/* else fall through */
	2354	case OP_CLOSE_DUP_NUM:
	2355	/* We treat it as a normal character. */
	2356
	2357	/* Then we can these characters as normal characters. */
	2358	token->type = CHARACTER;
	2359	/* mb_partial and word_char bits should be initialized already
	2360	by peek_token. */
	2361	tree = create_token_tree (dfa, NULL, NULL, token);
	2362	if (BE (tree == NULL, 0))
	2363	{
	2364	*err = REG_ESPACE;
	2365	return NULL;
	2366	}
	2367	break;
	2368	case ANCHOR:
	2369	if ((token->opr.ctx_type
	2370	& (WORD_DELIM \| NOT_WORD_DELIM \| WORD_FIRST \| WORD_LAST))
	2371	&& dfa->word_ops_used == 0)
	2372	init_word_char (dfa);
	2373	if (token->opr.ctx_type == WORD_DELIM
	2374	\|\| token->opr.ctx_type == NOT_WORD_DELIM)
	2375	{
	2376	bin_tree_t tree_first, tree_last;
	2377	if (token->opr.ctx_type == WORD_DELIM)
	2378	{
	2379	token->opr.ctx_type = WORD_FIRST;
	2380	tree_first = create_token_tree (dfa, NULL, NULL, token);
	2381	token->opr.ctx_type = WORD_LAST;
	2382	}
	2383	else
	2384	{
	2385	token->opr.ctx_type = INSIDE_WORD;
	2386	tree_first = create_token_tree (dfa, NULL, NULL, token);
	2387	token->opr.ctx_type = INSIDE_NOTWORD;
	2388	}
	2389	tree_last = create_token_tree (dfa, NULL, NULL, token);
	2390	tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
	2391	if (BE (tree_first == NULL \|\| tree_last == NULL \|\| tree == NULL, 0))
	2392	{
	2393	*err = REG_ESPACE;
	2394	return NULL;
	2395	}
	2396	}
	2397	else
	2398	{
	2399	tree = create_token_tree (dfa, NULL, NULL, token);
	2400	if (BE (tree == NULL, 0))
	2401	{
	2402	*err = REG_ESPACE;
	2403	return NULL;
	2404	}
	2405	}
	2406	/* We must return here, since ANCHORs can't be followed
	2407	by repetition operators.
	2408	eg. RE"^" is invalid or "<ANCHOR(^)><CHAR()>",
	2409	it must not be "<ANCHOR(^)><REPEAT()>". /
	2410	fetch_token (token, regexp, syntax);
	2411	return tree;
	2412	case OP_PERIOD:
	2413	tree = create_token_tree (dfa, NULL, NULL, token);
	2414	if (BE (tree == NULL, 0))
	2415	{
	2416	*err = REG_ESPACE;
	2417	return NULL;
	2418	}
	2419	if (dfa->mb_cur_max > 1)
	2420	dfa->has_mb_node = 1;
	2421	break;
	2422	case OP_WORD:
	2423	case OP_NOTWORD:
	2424	tree = build_charclass_op (dfa, regexp->trans,
	2425	"alnum",
	2426	"_",
	2427	token->type == OP_NOTWORD, err);
	2428	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2429	return NULL;
	2430	break;
	2431	case OP_SPACE:
	2432	case OP_NOTSPACE:
	2433	tree = build_charclass_op (dfa, regexp->trans,
	2434	"space",
	2435	"",
	2436	token->type == OP_NOTSPACE, err);
	2437	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2438	return NULL;
	2439	break;
	2440	case OP_ALT:
	2441	case END_OF_RE:
	2442	return NULL;
	2443	case BACK_SLASH:
	2444	*err = REG_EESCAPE;
	2445	return NULL;
	2446	default:
	2447	/* Must not happen? */
	2448	#ifdef DEBUG
	2449	assert (0);
	2450	#endif
	2451	return NULL;
	2452	}
	2453	fetch_token (token, regexp, syntax);
	2454
	2455	while (token->type == OP_DUP_ASTERISK \|\| token->type == OP_DUP_PLUS
	2456	\|\| token->type == OP_DUP_QUESTION \|\| token->type == OP_OPEN_DUP_NUM)
	2457	{
	2458	tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
	2459	if (BE (*err != REG_NOERROR && tree == NULL, 0))
	2460	return NULL;
	2461	/* In BRE consecutive duplications are not allowed. */
	2462	if ((syntax & RE_CONTEXT_INVALID_DUP)
	2463	&& (token->type == OP_DUP_ASTERISK
	2464	\|\| token->type == OP_OPEN_DUP_NUM))
	2465	{
	2466	*err = REG_BADRPT;
	2467	return NULL;
	2468	}
	2469	}
	2470
	2471	return tree;
	2472	}
	2473
	2474	/* This function build the following tree, from regular expression
	2475	(<reg_exp>):
	2476	SUBEXP
	2477	\|
	2478	<reg_exp>
	2479	*/
	2480
	2481	static bin_tree_t *
	2482	parse_sub_exp (re_string_t regexp, regex_t preg, re_token_t *token,
	2483	reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
	2484	{
	2485	re_dfa_t *dfa = preg->buffer;
	2486	bin_tree_t *tree;
	2487	size_t cur_nsub;
	2488	cur_nsub = preg->re_nsub++;
	2489
	2490	fetch_token (token, regexp, syntax \| RE_CARET_ANCHORS_HERE);
	2491
	2492	/* The subexpression may be a null string. */
	2493	if (token->type == OP_CLOSE_SUBEXP)
	2494	tree = NULL;
	2495	else
	2496	{
	2497	tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
	2498	if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
	2499	{
	2500	if (tree != NULL)
	2501	postorder (tree, free_tree, NULL);
	2502	*err = REG_EPAREN;
	2503	}
	2504	if (BE (*err != REG_NOERROR, 0))
	2505	return NULL;
	2506	}
	2507
	2508	if (cur_nsub <= '9' - '1')
	2509	dfa->completed_bkref_map \|= 1 << cur_nsub;
	2510
	2511	tree = create_tree (dfa, tree, NULL, SUBEXP);
	2512	if (BE (tree == NULL, 0))
	2513	{
	2514	*err = REG_ESPACE;
	2515	return NULL;
	2516	}
	2517	tree->token.opr.idx = cur_nsub;
	2518	return tree;
	2519	}
	2520
	2521	/* This function parse repetition operators like "", "+", "{1,3}" etc. /
	2522
	2523	static bin_tree_t *
	2524	parse_dup_op (bin_tree_t elem, re_string_t regexp, re_dfa_t *dfa,
	2525	re_token_t token, reg_syntax_t syntax, reg_errcode_t err)
	2526	{
	2527	bin_tree_t tree = NULL, old_tree = NULL;
	2528	Idx i, start, end, start_idx = re_string_cur_idx (regexp);
	2529	re_token_t start_token = *token;
	2530
	2531	if (token->type == OP_OPEN_DUP_NUM)
	2532	{
	2533	end = 0;
	2534	start = fetch_number (regexp, token, syntax);
	2535	if (start == REG_MISSING)
	2536	{
	2537	if (token->type == CHARACTER && token->opr.c == ',')
	2538	start = 0; /* We treat "{,m}" as "{0,m}". */
	2539	else
	2540	{
	2541	err = REG_BADBR; / <re>{} is invalid. */
	2542	return NULL;
	2543	}
	2544	}
	2545	if (BE (start != REG_ERROR, 1))
	2546	{
	2547	/* We treat "{n}" as "{n,n}". */
	2548	end = ((token->type == OP_CLOSE_DUP_NUM) ? start
	2549	: ((token->type == CHARACTER && token->opr.c == ',')
	2550	? fetch_number (regexp, token, syntax) : REG_ERROR));
	2551	}
	2552	if (BE (start == REG_ERROR \|\| end == REG_ERROR, 0))
	2553	{
	2554	/* Invalid sequence. */
	2555	if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
	2556	{
	2557	if (token->type == END_OF_RE)
	2558	*err = REG_EBRACE;
	2559	else
	2560	*err = REG_BADBR;
	2561
	2562	return NULL;
	2563	}
	2564
	2565	/* If the syntax bit is set, rollback. */
	2566	re_string_set_index (regexp, start_idx);
	2567	*token = start_token;
	2568	token->type = CHARACTER;
	2569	/* mb_partial and word_char bits should be already initialized by
	2570	peek_token. */
	2571	return elem;
	2572	}
	2573
	2574	if (BE ((end != REG_MISSING && start > end)
	2575	\|\| token->type != OP_CLOSE_DUP_NUM, 0))
	2576	{
	2577	/* First number greater than second. */
	2578	*err = REG_BADBR;
	2579	return NULL;
	2580	}
	2581
	2582	if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
	2583	{
	2584	*err = REG_ESIZE;
	2585	return NULL;
	2586	}
	2587	}
	2588	else
	2589	{
	2590	start = (token->type == OP_DUP_PLUS) ? 1 : 0;
	2591	end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
	2592	}
	2593
	2594	fetch_token (token, regexp, syntax);
	2595
	2596	if (BE (elem == NULL, 0))
	2597	return NULL;
	2598	if (BE (start == 0 && end == 0, 0))
	2599	{
	2600	postorder (elem, free_tree, NULL);
	2601	return NULL;
	2602	}
	2603
	2604	/* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
	2605	if (BE (start > 0, 0))
	2606	{
	2607	tree = elem;
	2608	for (i = 2; i <= start; ++i)
	2609	{
	2610	elem = duplicate_tree (elem, dfa);
	2611	tree = create_tree (dfa, tree, elem, CONCAT);
	2612	if (BE (elem == NULL \|\| tree == NULL, 0))
	2613	goto parse_dup_op_espace;
	2614	}
	2615
	2616	if (start == end)
	2617	return tree;
	2618
	2619	/* Duplicate ELEM before it is marked optional. */
	2620	elem = duplicate_tree (elem, dfa);
	2621	old_tree = tree;
	2622	}
	2623	else
	2624	old_tree = NULL;
	2625
	2626	if (elem->token.type == SUBEXP)
	2627	{
	2628	uintptr_t subidx = elem->token.opr.idx;
	2629	postorder (elem, mark_opt_subexp, (void *) subidx);
	2630	}
	2631
	2632	tree = create_tree (dfa, elem, NULL,
	2633	(end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
	2634	if (BE (tree == NULL, 0))
	2635	goto parse_dup_op_espace;
	2636
	2637	/* From gnulib's "intprops.h":
	2638	True if the arithmetic type T is signed. */
	2639	#define TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
	2640
	2641	/* This loop is actually executed only when end != REG_MISSING,
	2642	to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
	2643	already created the start+1-th copy. */
	2644	if (TYPE_SIGNED (Idx) \|\| end != REG_MISSING)
	2645	for (i = start + 2; i <= end; ++i)
	2646	{
	2647	elem = duplicate_tree (elem, dfa);
	2648	tree = create_tree (dfa, tree, elem, CONCAT);
	2649	if (BE (elem == NULL \|\| tree == NULL, 0))
	2650	goto parse_dup_op_espace;
	2651
	2652	tree = create_tree (dfa, tree, NULL, OP_ALT);
	2653	if (BE (tree == NULL, 0))
	2654	goto parse_dup_op_espace;
	2655	}
	2656
	2657	if (old_tree)
	2658	tree = create_tree (dfa, old_tree, tree, CONCAT);
	2659
	2660	return tree;
	2661
	2662	parse_dup_op_espace:
	2663	*err = REG_ESPACE;
	2664	return NULL;
	2665	}
	2666
	2667	/* Size of the names for collating symbol/equivalence_class/character_class.
	2668	I'm not sure, but maybe enough. */
	2669	#define BRACKET_NAME_BUF_SIZE 32
	2670
	2671	#ifndef _LIBC
	2672	/* Local function for parse_bracket_exp only used in case of NOT _LIBC.
	2673	Build the range expression which starts from START_ELEM, and ends
	2674	at END_ELEM. The result are written to MBCSET and SBCSET.
	2675	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
	2676	mbcset->range_ends, is a pointer argument since we may
	2677	update it. */
	2678
	2679	static reg_errcode_t
	2680	internal_function
	2681	# ifdef RE_ENABLE_I18N
	2682	build_range_exp (const reg_syntax_t syntax,
	2683	bitset_t sbcset,
	2684	re_charset_t *mbcset,
	2685	Idx *range_alloc,
	2686	const bracket_elem_t *start_elem,
	2687	const bracket_elem_t *end_elem)
	2688	# else /* not RE_ENABLE_I18N */
	2689	build_range_exp (const reg_syntax_t syntax,
	2690	bitset_t sbcset,
	2691	const bracket_elem_t *start_elem,
	2692	const bracket_elem_t *end_elem)
	2693	# endif /* not RE_ENABLE_I18N */
	2694	{
	2695	unsigned int start_ch, end_ch;
	2696	/* Equivalence Classes and Character Classes can't be a range start/end. */
	2697	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
	2698	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
	2699	0))
	2700	return REG_ERANGE;
	2701
	2702	/* We can handle no multi character collating elements without libc
	2703	support. */
	2704	if (BE ((start_elem->type == COLL_SYM
	2705	&& strlen ((char *) start_elem->opr.name) > 1)
	2706	\|\| (end_elem->type == COLL_SYM
	2707	&& strlen ((char *) end_elem->opr.name) > 1), 0))
	2708	return REG_ECOLLATE;
	2709
	2710	# ifdef RE_ENABLE_I18N
	2711	{
	2712	wchar_t wc;
	2713	wint_t start_wc;
	2714	wint_t end_wc;
	2715
	2716	start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
	2717	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
	2718	: 0));
	2719	end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
	2720	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
	2721	: 0));
	2722	start_wc = ((start_elem->type == SB_CHAR \|\| start_elem->type == COLL_SYM)
	2723	? __btowc (start_ch) : start_elem->opr.wch);
	2724	end_wc = ((end_elem->type == SB_CHAR \|\| end_elem->type == COLL_SYM)
	2725	? __btowc (end_ch) : end_elem->opr.wch);
	2726	if (start_wc == WEOF \|\| end_wc == WEOF)
	2727	return REG_ECOLLATE;
	2728	else if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_wc > end_wc, 0))
	2729	return REG_ERANGE;
	2730
	2731	/* Got valid collation sequence values, add them as a new entry.
	2732	However, for !_LIBC we have no collation elements: if the
	2733	character set is single byte, the single byte character set
	2734	that we build below suffices. parse_bracket_exp passes
	2735	no MBCSET if dfa->mb_cur_max == 1. */
	2736	if (mbcset)
	2737	{
	2738	/* Check the space of the arrays. */
	2739	if (BE (*range_alloc == mbcset->nranges, 0))
	2740	{
	2741	/* There is not enough space, need realloc. */
	2742	wchar_t new_array_start, new_array_end;
	2743	Idx new_nranges;
	2744
	2745	/* +1 in case of mbcset->nranges is 0. */
	2746	new_nranges = 2 * mbcset->nranges + 1;
	2747	/* Use realloc since mbcset->range_starts and mbcset->range_ends
	2748	are NULL if range_alloc == 0. /
	2749	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
	2750	new_nranges);
	2751	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
	2752	new_nranges);
	2753
	2754	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
	2755	return REG_ESPACE;
	2756
	2757	mbcset->range_starts = new_array_start;
	2758	mbcset->range_ends = new_array_end;
	2759	*range_alloc = new_nranges;
	2760	}
	2761
	2762	mbcset->range_starts[mbcset->nranges] = start_wc;
	2763	mbcset->range_ends[mbcset->nranges++] = end_wc;
	2764	}
	2765
	2766	/* Build the table for single byte characters. */
	2767	for (wc = 0; wc < SBC_MAX; ++wc)
	2768	{
	2769	if (start_wc <= wc && wc <= end_wc)
	2770	bitset_set (sbcset, wc);
	2771	}
	2772	}
	2773	# else /* not RE_ENABLE_I18N */
	2774	{
	2775	unsigned int ch;
	2776	start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
	2777	: ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
	2778	: 0));
	2779	end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
	2780	: ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
	2781	: 0));
	2782	if (start_ch > end_ch)
	2783	return REG_ERANGE;
	2784	/* Build the table for single byte characters. */
	2785	for (ch = 0; ch < SBC_MAX; ++ch)
	2786	if (start_ch <= ch && ch <= end_ch)
	2787	bitset_set (sbcset, ch);
	2788	}
	2789	# endif /* not RE_ENABLE_I18N */
	2790	return REG_NOERROR;
	2791	}
	2792	#endif /* not _LIBC */
	2793
	2794	#ifndef _LIBC
	2795	/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
	2796	Build the collating element which is represented by NAME.
	2797	The result are written to MBCSET and SBCSET.
	2798	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
	2799	pointer argument since we may update it. */
	2800
	2801	static reg_errcode_t
	2802	internal_function
	2803	# ifdef RE_ENABLE_I18N
	2804	build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
	2805	Idx coll_sym_alloc, const unsigned char name)
	2806	# else /* not RE_ENABLE_I18N */
	2807	build_collating_symbol (bitset_t sbcset, const unsigned char *name)
	2808	# endif /* not RE_ENABLE_I18N */
	2809	{
	2810	size_t name_len = strlen ((const char *) name);
	2811	if (BE (name_len != 1, 0))
	2812	return REG_ECOLLATE;
	2813	else
	2814	{
	2815	bitset_set (sbcset, name[0]);
	2816	return REG_NOERROR;
	2817	}
	2818	}
	2819	#endif /* not _LIBC */
	2820
	2821	/* This function parse bracket expression like "[abc]", "[a-c]",
	2822	"[[.a-a.]]" etc. */
	2823
	2824	static bin_tree_t *
	2825	parse_bracket_exp (re_string_t regexp, re_dfa_t dfa, re_token_t *token,
	2826	reg_syntax_t syntax, reg_errcode_t *err)
	2827	{
	2828	#ifdef _LIBC
	2829	const unsigned char *collseqmb;
	2830	const char *collseqwc;
	2831	uint32_t nrules;
	2832	int32_t table_size;
	2833	const int32_t *symb_table;
	2834	const unsigned char *extra;
	2835
	2836	/* Local function for parse_bracket_exp used in _LIBC environment.
	2837	Seek the collating symbol entry corresponding to NAME.
	2838	Return the index of the symbol in the SYMB_TABLE. */
	2839
	2840	auto inline int32_t
	2841	__attribute ((always_inline))
	2842	seek_collating_symbol_entry (name, name_len)
	2843	const unsigned char *name;
	2844	size_t name_len;
	2845	{
	2846	int32_t hash = elem_hash ((const char *) name, name_len);
	2847	int32_t elem = hash % table_size;
	2848	if (symb_table[2 * elem] != 0)
	2849	{
	2850	int32_t second = hash % (table_size - 2) + 1;
	2851
	2852	do
	2853	{
	2854	/* First compare the hashing value. */
	2855	if (symb_table[2 * elem] == hash
	2856	/* Compare the length of the name. */
	2857	&& name_len == extra[symb_table[2 * elem + 1]]
	2858	/* Compare the name. */
	2859	&& memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
	2860	name_len) == 0)
	2861	{
	2862	/* Yep, this is the entry. */
	2863	break;
	2864	}
	2865
	2866	/* Next entry. */
	2867	elem += second;
	2868	}
	2869	while (symb_table[2 * elem] != 0);
	2870	}
	2871	return elem;
	2872	}
	2873
	2874	/* Local function for parse_bracket_exp used in _LIBC environment.
	2875	Look up the collation sequence value of BR_ELEM.
	2876	Return the value if succeeded, UINT_MAX otherwise. */
	2877
	2878	auto inline unsigned int
	2879	__attribute ((always_inline))
	2880	lookup_collation_sequence_value (br_elem)
	2881	bracket_elem_t *br_elem;
	2882	{
	2883	if (br_elem->type == SB_CHAR)
	2884	{
	2885	/*
	2886	if (MB_CUR_MAX == 1)
	2887	*/
	2888	if (nrules == 0)
	2889	return collseqmb[br_elem->opr.ch];
	2890	else
	2891	{
	2892	wint_t wc = __btowc (br_elem->opr.ch);
	2893	return __collseq_table_lookup (collseqwc, wc);
	2894	}
	2895	}
	2896	else if (br_elem->type == MB_CHAR)
	2897	{
	2898	if (nrules != 0)
	2899	return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
	2900	}
	2901	else if (br_elem->type == COLL_SYM)
	2902	{
	2903	size_t sym_name_len = strlen ((char *) br_elem->opr.name);
	2904	if (nrules != 0)
	2905	{
	2906	int32_t elem, idx;
	2907	elem = seek_collating_symbol_entry (br_elem->opr.name,
	2908	sym_name_len);
	2909	if (symb_table[2 * elem] != 0)
	2910	{
	2911	/* We found the entry. */
	2912	idx = symb_table[2 * elem + 1];
	2913	/* Skip the name of collating element name. */
	2914	idx += 1 + extra[idx];
	2915	/* Skip the byte sequence of the collating element. */
	2916	idx += 1 + extra[idx];
	2917	/* Adjust for the alignment. */
	2918	idx = (idx + 3) & ~3;
	2919	/* Skip the multibyte collation sequence value. */
	2920	idx += sizeof (unsigned int);
	2921	/* Skip the wide char sequence of the collating element. */
	2922	idx += sizeof (unsigned int) *
	2923	(1 + (unsigned int ) (extra + idx));
	2924	/* Return the collation sequence value. */
	2925	return (unsigned int ) (extra + idx);
	2926	}
	2927	else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
	2928	{
	2929	/* No valid character. Match it as a single byte
	2930	character. */
	2931	return collseqmb[br_elem->opr.name[0]];
	2932	}
	2933	}
	2934	else if (sym_name_len == 1)
	2935	return collseqmb[br_elem->opr.name[0]];
	2936	}
	2937	return UINT_MAX;
	2938	}
	2939
	2940	/* Local function for parse_bracket_exp used in _LIBC environment.
	2941	Build the range expression which starts from START_ELEM, and ends
	2942	at END_ELEM. The result are written to MBCSET and SBCSET.
	2943	RANGE_ALLOC is the allocated size of mbcset->range_starts, and
	2944	mbcset->range_ends, is a pointer argument since we may
	2945	update it. */
	2946
	2947	auto inline reg_errcode_t
	2948	__attribute ((always_inline))
	2949	build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
	2950	re_charset_t *mbcset;
	2951	Idx *range_alloc;
	2952	bitset_t sbcset;
	2953	bracket_elem_t start_elem, end_elem;
	2954	{
	2955	unsigned int ch;
	2956	uint32_t start_collseq;
	2957	uint32_t end_collseq;
	2958
	2959	/* Equivalence Classes and Character Classes can't be a range
	2960	start/end. */
	2961	if (BE (start_elem->type == EQUIV_CLASS \|\| start_elem->type == CHAR_CLASS
	2962	\|\| end_elem->type == EQUIV_CLASS \|\| end_elem->type == CHAR_CLASS,
	2963	0))
	2964	return REG_ERANGE;
	2965
	2966	/* FIXME: Implement rational ranges here, too. */
	2967	start_collseq = lookup_collation_sequence_value (start_elem);
	2968	end_collseq = lookup_collation_sequence_value (end_elem);
	2969	/* Check start/end collation sequence values. */
	2970	if (BE (start_collseq == UINT_MAX \|\| end_collseq == UINT_MAX, 0))
	2971	return REG_ECOLLATE;
	2972	if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
	2973	return REG_ERANGE;
	2974
	2975	/* Got valid collation sequence values, add them as a new entry.
	2976	However, if we have no collation elements, and the character set
	2977	is single byte, the single byte character set that we
	2978	build below suffices. */
	2979	if (nrules > 0 \|\| dfa->mb_cur_max > 1)
	2980	{
	2981	/* Check the space of the arrays. */
	2982	if (BE (*range_alloc == mbcset->nranges, 0))
	2983	{
	2984	/* There is not enough space, need realloc. */
	2985	uint32_t *new_array_start;
	2986	uint32_t *new_array_end;
	2987	Idx new_nranges;
	2988
	2989	/* +1 in case of mbcset->nranges is 0. */
	2990	new_nranges = 2 * mbcset->nranges + 1;
	2991	new_array_start = re_realloc (mbcset->range_starts, uint32_t,
	2992	new_nranges);
	2993	new_array_end = re_realloc (mbcset->range_ends, uint32_t,
	2994	new_nranges);
	2995
	2996	if (BE (new_array_start == NULL \|\| new_array_end == NULL, 0))
	2997	return REG_ESPACE;
	2998
	2999	mbcset->range_starts = new_array_start;
	3000	mbcset->range_ends = new_array_end;
	3001	*range_alloc = new_nranges;
	3002	}
	3003
	3004	mbcset->range_starts[mbcset->nranges] = start_collseq;
	3005	mbcset->range_ends[mbcset->nranges++] = end_collseq;
	3006	}
	3007
	3008	/* Build the table for single byte characters. */
	3009	for (ch = 0; ch < SBC_MAX; ch++)
	3010	{
	3011	uint32_t ch_collseq;
	3012	/*
	3013	if (MB_CUR_MAX == 1)
	3014	*/
	3015	if (nrules == 0)
	3016	ch_collseq = collseqmb[ch];
	3017	else
	3018	ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
	3019	if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
	3020	bitset_set (sbcset, ch);
	3021	}
	3022	return REG_NOERROR;
	3023	}
	3024
	3025	/* Local function for parse_bracket_exp used in _LIBC environment.
	3026	Build the collating element which is represented by NAME.
	3027	The result are written to MBCSET and SBCSET.
	3028	COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
	3029	pointer argument since we may update it. */
	3030
	3031	auto inline reg_errcode_t
	3032	__attribute ((always_inline))
	3033	build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
	3034	re_charset_t *mbcset;
	3035	Idx *coll_sym_alloc;
	3036	bitset_t sbcset;
	3037	const unsigned char *name;
	3038	{
	3039	int32_t elem, idx;
	3040	size_t name_len = strlen ((const char *) name);
	3041	if (nrules != 0)
	3042	{
	3043	elem = seek_collating_symbol_entry (name, name_len);
	3044	if (symb_table[2 * elem] != 0)
	3045	{
	3046	/* We found the entry. */
	3047	idx = symb_table[2 * elem + 1];
	3048	/* Skip the name of collating element name. */
	3049	idx += 1 + extra[idx];
	3050	}
	3051	else if (symb_table[2 * elem] == 0 && name_len == 1)
	3052	{
	3053	/* No valid character, treat it as a normal
	3054	character. */
	3055	bitset_set (sbcset, name[0]);
	3056	return REG_NOERROR;
	3057	}
	3058	else
	3059	return REG_ECOLLATE;
	3060
	3061	/* Got valid collation sequence, add it as a new entry. */
	3062	/* Check the space of the arrays. */
	3063	if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
	3064	{
	3065	/* Not enough, realloc it. */
	3066	/* +1 in case of mbcset->ncoll_syms is 0. */
	3067	Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
	3068	/* Use realloc since mbcset->coll_syms is NULL
	3069	if alloc == 0. /
	3070	int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
	3071	new_coll_sym_alloc);
	3072	if (BE (new_coll_syms == NULL, 0))
	3073	return REG_ESPACE;
	3074	mbcset->coll_syms = new_coll_syms;
	3075	*coll_sym_alloc = new_coll_sym_alloc;
	3076	}
	3077	mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
	3078	return REG_NOERROR;
	3079	}
	3080	else
	3081	{
	3082	if (BE (name_len != 1, 0))
	3083	return REG_ECOLLATE;
	3084	else
	3085	{
	3086	bitset_set (sbcset, name[0]);
	3087	return REG_NOERROR;
	3088	}
	3089	}
	3090	}
	3091	#endif
	3092
	3093	re_token_t br_token;
	3094	re_bitset_ptr_t sbcset;
	3095	#ifdef RE_ENABLE_I18N
	3096	re_charset_t *mbcset;
	3097	Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
	3098	Idx equiv_class_alloc = 0, char_class_alloc = 0;
	3099	#endif /* not RE_ENABLE_I18N */
	3100	bool non_match = false;
	3101	bin_tree_t *work_tree;
	3102	int token_len;
	3103	bool first_round = true;
	3104	#ifdef _LIBC
	3105	collseqmb = (const unsigned char *)
	3106	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
	3107	nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
	3108	if (nrules)
	3109	{
	3110	/*
	3111	if (MB_CUR_MAX > 1)
	3112	*/
	3113	collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
	3114	table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
	3115	symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
	3116	_NL_COLLATE_SYMB_TABLEMB);
	3117	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
	3118	_NL_COLLATE_SYMB_EXTRAMB);
	3119	}
	3120	#endif
	3121	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
	3122	#ifdef RE_ENABLE_I18N
	3123	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
	3124	#endif /* RE_ENABLE_I18N */
	3125	#ifdef RE_ENABLE_I18N
	3126	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
	3127	#else
	3128	if (BE (sbcset == NULL, 0))
	3129	#endif /* RE_ENABLE_I18N */
	3130	{
	3131	re_free (sbcset);
	3132	#ifdef RE_ENABLE_I18N
	3133	re_free (mbcset);
	3134	#endif
	3135	*err = REG_ESPACE;
	3136	return NULL;
	3137	}
	3138
	3139	token_len = peek_token_bracket (token, regexp, syntax);
	3140	if (BE (token->type == END_OF_RE, 0))
	3141	{
	3142	*err = REG_BADPAT;
	3143	goto parse_bracket_exp_free_return;
	3144	}
	3145	if (token->type == OP_NON_MATCH_LIST)
	3146	{
	3147	#ifdef RE_ENABLE_I18N
	3148	mbcset->non_match = 1;
	3149	#endif /* not RE_ENABLE_I18N */
	3150	non_match = true;
	3151	if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
	3152	bitset_set (sbcset, '\n');
	3153	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
	3154	token_len = peek_token_bracket (token, regexp, syntax);
	3155	if (BE (token->type == END_OF_RE, 0))
	3156	{
	3157	*err = REG_BADPAT;
	3158	goto parse_bracket_exp_free_return;
	3159	}
	3160	}
	3161
	3162	/* We treat the first ']' as a normal character. */
	3163	if (token->type == OP_CLOSE_BRACKET)
	3164	token->type = CHARACTER;
	3165
	3166	while (1)
	3167	{
	3168	bracket_elem_t start_elem, end_elem;
	3169	unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
	3170	unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
	3171	reg_errcode_t ret;
	3172	int token_len2 = 0;
	3173	bool is_range_exp = false;
	3174	re_token_t token2;
	3175
	3176	start_elem.opr.name = start_name_buf;
	3177	ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
	3178	syntax, first_round);
	3179	if (BE (ret != REG_NOERROR, 0))
	3180	{
	3181	*err = ret;
	3182	goto parse_bracket_exp_free_return;
	3183	}
	3184	first_round = false;
	3185
	3186	/* Get information about the next token. We need it in any case. */
	3187	token_len = peek_token_bracket (token, regexp, syntax);
	3188
	3189	/* Do not check for ranges if we know they are not allowed. */
	3190	if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
	3191	{
	3192	if (BE (token->type == END_OF_RE, 0))
	3193	{
	3194	*err = REG_EBRACK;
	3195	goto parse_bracket_exp_free_return;
	3196	}
	3197	if (token->type == OP_CHARSET_RANGE)
	3198	{
	3199	re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
	3200	token_len2 = peek_token_bracket (&token2, regexp, syntax);
	3201	if (BE (token2.type == END_OF_RE, 0))
	3202	{
	3203	*err = REG_EBRACK;
	3204	goto parse_bracket_exp_free_return;
	3205	}
	3206	if (token2.type == OP_CLOSE_BRACKET)
	3207	{
	3208	/* We treat the last '-' as a normal character. */
	3209	re_string_skip_bytes (regexp, -token_len);
	3210	token->type = CHARACTER;
	3211	}
	3212	else
	3213	is_range_exp = true;
	3214	}
	3215	}
	3216
	3217	if (is_range_exp == true)
	3218	{
	3219	end_elem.opr.name = end_name_buf;
	3220	ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
	3221	dfa, syntax, true);
	3222	if (BE (ret != REG_NOERROR, 0))
	3223	{
	3224	*err = ret;
	3225	goto parse_bracket_exp_free_return;
	3226	}
	3227
	3228	token_len = peek_token_bracket (token, regexp, syntax);
	3229
	3230	#ifdef _LIBC
	3231	*err = build_range_exp (sbcset, mbcset, &range_alloc,
	3232	&start_elem, &end_elem);
	3233	#else
	3234	# ifdef RE_ENABLE_I18N
	3235	*err = build_range_exp (syntax, sbcset,
	3236	dfa->mb_cur_max > 1 ? mbcset : NULL,
	3237	&range_alloc, &start_elem, &end_elem);
	3238	# else
	3239	*err = build_range_exp (syntax, sbcset, &start_elem, &end_elem);
	3240	# endif
	3241	#endif /* RE_ENABLE_I18N */
	3242	if (BE (*err != REG_NOERROR, 0))
	3243	goto parse_bracket_exp_free_return;
	3244	}
	3245	else
	3246	{
	3247	switch (start_elem.type)
	3248	{
	3249	case SB_CHAR:
	3250	bitset_set (sbcset, start_elem.opr.ch);
	3251	break;
	3252	#ifdef RE_ENABLE_I18N
	3253	case MB_CHAR:
	3254	/* Check whether the array has enough space. */
	3255	if (BE (mbchar_alloc == mbcset->nmbchars, 0))
	3256	{
	3257	wchar_t *new_mbchars;
	3258	/* Not enough, realloc it. */
	3259	/* +1 in case of mbcset->nmbchars is 0. */
	3260	mbchar_alloc = 2 * mbcset->nmbchars + 1;
	3261	/* Use realloc since array is NULL if alloc == 0. /
	3262	new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
	3263	mbchar_alloc);
	3264	if (BE (new_mbchars == NULL, 0))
	3265	goto parse_bracket_exp_espace;
	3266	mbcset->mbchars = new_mbchars;
	3267	}
	3268	mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
	3269	break;
	3270	#endif /* RE_ENABLE_I18N */
	3271	case EQUIV_CLASS:
	3272	*err = build_equiv_class (sbcset,
	3273	#ifdef RE_ENABLE_I18N
	3274	mbcset, &equiv_class_alloc,
	3275	#endif /* RE_ENABLE_I18N */
	3276	start_elem.opr.name);
	3277	if (BE (*err != REG_NOERROR, 0))
	3278	goto parse_bracket_exp_free_return;
	3279	break;
	3280	case COLL_SYM:
	3281	*err = build_collating_symbol (sbcset,
	3282	#ifdef RE_ENABLE_I18N
	3283	mbcset, &coll_sym_alloc,
	3284	#endif /* RE_ENABLE_I18N */
	3285	start_elem.opr.name);
	3286	if (BE (*err != REG_NOERROR, 0))
	3287	goto parse_bracket_exp_free_return;
	3288	break;
	3289	case CHAR_CLASS:
	3290	*err = build_charclass (regexp->trans, sbcset,
	3291	#ifdef RE_ENABLE_I18N
	3292	mbcset, &char_class_alloc,
	3293	#endif /* RE_ENABLE_I18N */
	3294	(const char *) start_elem.opr.name,
	3295	syntax);
	3296	if (BE (*err != REG_NOERROR, 0))
	3297	goto parse_bracket_exp_free_return;
	3298	break;
	3299	default:
	3300	assert (0);
	3301	break;
	3302	}
	3303	}
	3304	if (BE (token->type == END_OF_RE, 0))
	3305	{
	3306	*err = REG_EBRACK;
	3307	goto parse_bracket_exp_free_return;
	3308	}
	3309	if (token->type == OP_CLOSE_BRACKET)
	3310	break;
	3311	}
	3312
	3313	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
	3314
	3315	/* If it is non-matching list. */
	3316	if (non_match)
	3317	bitset_not (sbcset);
	3318
	3319	#ifdef RE_ENABLE_I18N
	3320	/* Ensure only single byte characters are set. */
	3321	if (dfa->mb_cur_max > 1)
	3322	bitset_mask (sbcset, dfa->sb_char);
	3323
	3324	if (mbcset->nmbchars \|\| mbcset->ncoll_syms \|\| mbcset->nequiv_classes
	3325	\|\| mbcset->nranges \|\| (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
	3326	\|\| mbcset->non_match)))
	3327	{
	3328	bin_tree_t *mbc_tree;
	3329	int sbc_idx;
	3330	/* Build a tree for complex bracket. */
	3331	dfa->has_mb_node = 1;
	3332	br_token.type = COMPLEX_BRACKET;
	3333	br_token.opr.mbcset = mbcset;
	3334	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3335	if (BE (mbc_tree == NULL, 0))
	3336	goto parse_bracket_exp_espace;
	3337	for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
	3338	if (sbcset[sbc_idx])
	3339	break;
	3340	/* If there are no bits set in sbcset, there is no point
	3341	of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
	3342	if (sbc_idx < BITSET_WORDS)
	3343	{
	3344	/* Build a tree for simple bracket. */
	3345	br_token.type = SIMPLE_BRACKET;
	3346	br_token.opr.sbcset = sbcset;
	3347	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3348	if (BE (work_tree == NULL, 0))
	3349	goto parse_bracket_exp_espace;
	3350
	3351	/* Then join them by ALT node. */
	3352	work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
	3353	if (BE (work_tree == NULL, 0))
	3354	goto parse_bracket_exp_espace;
	3355	}
	3356	else
	3357	{
	3358	re_free (sbcset);
	3359	work_tree = mbc_tree;
	3360	}
	3361	}
	3362	else
	3363	#endif /* not RE_ENABLE_I18N */
	3364	{
	3365	#ifdef RE_ENABLE_I18N
	3366	free_charset (mbcset);
	3367	#endif
	3368	/* Build a tree for simple bracket. */
	3369	br_token.type = SIMPLE_BRACKET;
	3370	br_token.opr.sbcset = sbcset;
	3371	work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3372	if (BE (work_tree == NULL, 0))
	3373	goto parse_bracket_exp_espace;
	3374	}
	3375	return work_tree;
	3376
	3377	parse_bracket_exp_espace:
	3378	*err = REG_ESPACE;
	3379	parse_bracket_exp_free_return:
	3380	re_free (sbcset);
	3381	#ifdef RE_ENABLE_I18N
	3382	free_charset (mbcset);
	3383	#endif /* RE_ENABLE_I18N */
	3384	return NULL;
	3385	}
	3386
	3387	/* Parse an element in the bracket expression. */
	3388
	3389	static reg_errcode_t
	3390	parse_bracket_element (bracket_elem_t elem, re_string_t regexp,
	3391	re_token_t token, int token_len, re_dfa_t dfa,
	3392	reg_syntax_t syntax, bool accept_hyphen)
	3393	{
	3394	#ifdef RE_ENABLE_I18N
	3395	int cur_char_size;
	3396	cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
	3397	if (cur_char_size > 1)
	3398	{
	3399	elem->type = MB_CHAR;
	3400	elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
	3401	re_string_skip_bytes (regexp, cur_char_size);
	3402	return REG_NOERROR;
	3403	}
	3404	#endif /* RE_ENABLE_I18N */
	3405	re_string_skip_bytes (regexp, token_len); /* Skip a token. */
	3406	if (token->type == OP_OPEN_COLL_ELEM \|\| token->type == OP_OPEN_CHAR_CLASS
	3407	\|\| token->type == OP_OPEN_EQUIV_CLASS)
	3408	return parse_bracket_symbol (elem, regexp, token);
	3409	if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
	3410	{
	3411	/* A '-' must only appear as anything but a range indicator before
	3412	the closing bracket. Everything else is an error. */
	3413	re_token_t token2;
	3414	(void) peek_token_bracket (&token2, regexp, syntax);
	3415	if (token2.type != OP_CLOSE_BRACKET)
	3416	/* The actual error value is not standardized since this whole
	3417	case is undefined. But ERANGE makes good sense. */
	3418	return REG_ERANGE;
	3419	}
	3420	elem->type = SB_CHAR;
	3421	elem->opr.ch = token->opr.c;
	3422	return REG_NOERROR;
	3423	}
	3424
	3425	/* Parse a bracket symbol in the bracket expression. Bracket symbols are
	3426	such as [:<character_class>:], [.<collating_element>.], and
	3427	[=<equivalent_class>=]. */
	3428
	3429	static reg_errcode_t
	3430	parse_bracket_symbol (bracket_elem_t elem, re_string_t regexp,
	3431	re_token_t *token)
	3432	{
	3433	unsigned char ch, delim = token->opr.c;
	3434	int i = 0;
	3435	if (re_string_eoi(regexp))
	3436	return REG_EBRACK;
	3437	for (;; ++i)
	3438	{
	3439	if (i >= BRACKET_NAME_BUF_SIZE)
	3440	return REG_EBRACK;
	3441	if (token->type == OP_OPEN_CHAR_CLASS)
	3442	ch = re_string_fetch_byte_case (regexp);
	3443	else
	3444	ch = re_string_fetch_byte (regexp);
	3445	if (re_string_eoi(regexp))
	3446	return REG_EBRACK;
	3447	if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
	3448	break;
	3449	elem->opr.name[i] = ch;
	3450	}
	3451	re_string_skip_bytes (regexp, 1);
	3452	elem->opr.name[i] = '\0';
	3453	switch (token->type)
	3454	{
	3455	case OP_OPEN_COLL_ELEM:
	3456	elem->type = COLL_SYM;
	3457	break;
	3458	case OP_OPEN_EQUIV_CLASS:
	3459	elem->type = EQUIV_CLASS;
	3460	break;
	3461	case OP_OPEN_CHAR_CLASS:
	3462	elem->type = CHAR_CLASS;
	3463	break;
	3464	default:
	3465	break;
	3466	}
	3467	return REG_NOERROR;
	3468	}
	3469
	3470	/* Helper function for parse_bracket_exp.
	3471	Build the equivalence class which is represented by NAME.
	3472	The result are written to MBCSET and SBCSET.
	3473	EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
	3474	is a pointer argument since we may update it. */
	3475
	3476	static reg_errcode_t
	3477	#ifdef RE_ENABLE_I18N
	3478	build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
	3479	Idx equiv_class_alloc, const unsigned char name)
	3480	#else /* not RE_ENABLE_I18N */
	3481	build_equiv_class (bitset_t sbcset, const unsigned char *name)
	3482	#endif /* not RE_ENABLE_I18N */
	3483	{
	3484	#ifdef _LIBC
	3485	uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
	3486	if (nrules != 0)
	3487	{
	3488	const int32_t table, indirect;
	3489	const unsigned char weights, extra, *cp;
	3490	unsigned char char_buf[2];
	3491	int32_t idx1, idx2;
	3492	unsigned int ch;
	3493	size_t len;
	3494	/* This #include defines a local function! */
	3495	# include <locale/weight.h>
	3496	/* Calculate the index for equivalence class. */
	3497	cp = name;
	3498	table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
	3499	weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
	3500	_NL_COLLATE_WEIGHTMB);
	3501	extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
	3502	_NL_COLLATE_EXTRAMB);
	3503	indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
	3504	_NL_COLLATE_INDIRECTMB);
	3505	idx1 = findidx (&cp, -1);
	3506	if (BE (idx1 == 0 \|\| *cp != '\0', 0))
	3507	/* This isn't a valid character. */
	3508	return REG_ECOLLATE;
	3509
	3510	/* Build single byte matching table for this equivalence class. */
	3511	len = weights[idx1 & 0xffffff];
	3512	for (ch = 0; ch < SBC_MAX; ++ch)
	3513	{
	3514	char_buf[0] = ch;
	3515	cp = char_buf;
	3516	idx2 = findidx (&cp, 1);
	3517	/*
	3518	idx2 = table[ch];
	3519	*/
	3520	if (idx2 == 0)
	3521	/* This isn't a valid character. */
	3522	continue;
	3523	/* Compare only if the length matches and the collation rule
	3524	index is the same. */
	3525	if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
	3526	{
	3527	int cnt = 0;
	3528
	3529	while (cnt <= len &&
	3530	weights[(idx1 & 0xffffff) + 1 + cnt]
	3531	== weights[(idx2 & 0xffffff) + 1 + cnt])
	3532	++cnt;
	3533
	3534	if (cnt > len)
	3535	bitset_set (sbcset, ch);
	3536	}
	3537	}
	3538	/* Check whether the array has enough space. */
	3539	if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
	3540	{
	3541	/* Not enough, realloc it. */
	3542	/* +1 in case of mbcset->nequiv_classes is 0. */
	3543	Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
	3544	/* Use realloc since the array is NULL if alloc == 0. /
	3545	int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
	3546	int32_t,
	3547	new_equiv_class_alloc);
	3548	if (BE (new_equiv_classes == NULL, 0))
	3549	return REG_ESPACE;
	3550	mbcset->equiv_classes = new_equiv_classes;
	3551	*equiv_class_alloc = new_equiv_class_alloc;
	3552	}
	3553	mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
	3554	}
	3555	else
	3556	#endif /* _LIBC */
	3557	{
	3558	if (BE (strlen ((const char *) name) != 1, 0))
	3559	return REG_ECOLLATE;
	3560	bitset_set (sbcset, *name);
	3561	}
	3562	return REG_NOERROR;
	3563	}
	3564
	3565	/* Helper function for parse_bracket_exp.
	3566	Build the character class which is represented by NAME.
	3567	The result are written to MBCSET and SBCSET.
	3568	CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
	3569	is a pointer argument since we may update it. */
	3570
	3571	static reg_errcode_t
	3572	#ifdef RE_ENABLE_I18N
	3573	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
	3574	re_charset_t mbcset, Idx char_class_alloc,
	3575	const char *class_name, reg_syntax_t syntax)
	3576	#else /* not RE_ENABLE_I18N */
	3577	build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
	3578	const char *class_name, reg_syntax_t syntax)
	3579	#endif /* not RE_ENABLE_I18N */
	3580	{
	3581	int i;
	3582	const char *name = class_name;
	3583
	3584	/* In case of REG_ICASE "upper" and "lower" match the both of
	3585	upper and lower cases. */
	3586	if ((syntax & RE_ICASE)
	3587	&& (strcmp (name, "upper") == 0 \|\| strcmp (name, "lower") == 0))
	3588	name = "alpha";
	3589
	3590	#ifdef RE_ENABLE_I18N
	3591	/* Check the space of the arrays. */
	3592	if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
	3593	{
	3594	/* Not enough, realloc it. */
	3595	/* +1 in case of mbcset->nchar_classes is 0. */
	3596	Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
	3597	/* Use realloc since array is NULL if alloc == 0. /
	3598	wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
	3599	new_char_class_alloc);
	3600	if (BE (new_char_classes == NULL, 0))
	3601	return REG_ESPACE;
	3602	mbcset->char_classes = new_char_classes;
	3603	*char_class_alloc = new_char_class_alloc;
	3604	}
	3605	mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
	3606	#endif /* RE_ENABLE_I18N */
	3607
	3608	#define BUILD_CHARCLASS_LOOP(ctype_func) \
	3609	do { \
	3610	if (BE (trans != NULL, 0)) \
	3611	{ \
	3612	for (i = 0; i < SBC_MAX; ++i) \
	3613	if (ctype_func (i)) \
	3614	bitset_set (sbcset, trans[i]); \
	3615	} \
	3616	else \
	3617	{ \
	3618	for (i = 0; i < SBC_MAX; ++i) \
	3619	if (ctype_func (i)) \
	3620	bitset_set (sbcset, i); \
	3621	} \
	3622	} while (0)
	3623
	3624	if (strcmp (name, "alnum") == 0)
	3625	BUILD_CHARCLASS_LOOP (isalnum);
	3626	else if (strcmp (name, "cntrl") == 0)
	3627	BUILD_CHARCLASS_LOOP (iscntrl);
	3628	else if (strcmp (name, "lower") == 0)
	3629	BUILD_CHARCLASS_LOOP (islower);
	3630	else if (strcmp (name, "space") == 0)
	3631	BUILD_CHARCLASS_LOOP (isspace);
	3632	else if (strcmp (name, "alpha") == 0)
	3633	BUILD_CHARCLASS_LOOP (isalpha);
	3634	else if (strcmp (name, "digit") == 0)
	3635	BUILD_CHARCLASS_LOOP (isdigit);
	3636	else if (strcmp (name, "print") == 0)
	3637	BUILD_CHARCLASS_LOOP (isprint);
	3638	else if (strcmp (name, "upper") == 0)
	3639	BUILD_CHARCLASS_LOOP (isupper);
	3640	else if (strcmp (name, "blank") == 0)
	3641	BUILD_CHARCLASS_LOOP (isblank);
	3642	else if (strcmp (name, "graph") == 0)
	3643	BUILD_CHARCLASS_LOOP (isgraph);
	3644	else if (strcmp (name, "punct") == 0)
	3645	BUILD_CHARCLASS_LOOP (ispunct);
	3646	else if (strcmp (name, "xdigit") == 0)
	3647	BUILD_CHARCLASS_LOOP (isxdigit);
	3648	else
	3649	return REG_ECTYPE;
	3650
	3651	return REG_NOERROR;
	3652	}
	3653
	3654	static bin_tree_t *
	3655	build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
	3656	const char *class_name,
	3657	const char *extra, bool non_match,
	3658	reg_errcode_t *err)
	3659	{
	3660	re_bitset_ptr_t sbcset;
	3661	#ifdef RE_ENABLE_I18N
	3662	re_charset_t *mbcset;
	3663	Idx alloc = 0;
	3664	#endif /* not RE_ENABLE_I18N */
	3665	reg_errcode_t ret;
	3666	re_token_t br_token;
	3667	bin_tree_t *tree;
	3668
	3669	sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
	3670	#ifdef RE_ENABLE_I18N
	3671	mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
	3672	#endif /* RE_ENABLE_I18N */
	3673
	3674	#ifdef RE_ENABLE_I18N
	3675	if (BE (sbcset == NULL \|\| mbcset == NULL, 0))
	3676	#else /* not RE_ENABLE_I18N */
	3677	if (BE (sbcset == NULL, 0))
	3678	#endif /* not RE_ENABLE_I18N */
	3679	{
	3680	*err = REG_ESPACE;
	3681	return NULL;
	3682	}
	3683
	3684	if (non_match)
	3685	{
	3686	#ifdef RE_ENABLE_I18N
	3687	mbcset->non_match = 1;
	3688	#endif /* not RE_ENABLE_I18N */
	3689	}
	3690
	3691	/* We don't care the syntax in this case. */
	3692	ret = build_charclass (trans, sbcset,
	3693	#ifdef RE_ENABLE_I18N
	3694	mbcset, &alloc,
	3695	#endif /* RE_ENABLE_I18N */
	3696	class_name, 0);
	3697
	3698	if (BE (ret != REG_NOERROR, 0))
	3699	{
	3700	re_free (sbcset);
	3701	#ifdef RE_ENABLE_I18N
	3702	free_charset (mbcset);
	3703	#endif /* RE_ENABLE_I18N */
	3704	*err = ret;
	3705	return NULL;
	3706	}
	3707	/* \w match '_' also. */
	3708	for (; *extra; extra++)
	3709	bitset_set (sbcset, *extra);
	3710
	3711	/* If it is non-matching list. */
	3712	if (non_match)
	3713	bitset_not (sbcset);
	3714
	3715	#ifdef RE_ENABLE_I18N
	3716	/* Ensure only single byte characters are set. */
	3717	if (dfa->mb_cur_max > 1)
	3718	bitset_mask (sbcset, dfa->sb_char);
	3719	#endif
	3720
	3721	/* Build a tree for simple bracket. */
	3722	br_token.type = SIMPLE_BRACKET;
	3723	br_token.opr.sbcset = sbcset;
	3724	tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3725	if (BE (tree == NULL, 0))
	3726	goto build_word_op_espace;
	3727
	3728	#ifdef RE_ENABLE_I18N
	3729	if (dfa->mb_cur_max > 1)
	3730	{
	3731	bin_tree_t *mbc_tree;
	3732	/* Build a tree for complex bracket. */
	3733	br_token.type = COMPLEX_BRACKET;
	3734	br_token.opr.mbcset = mbcset;
	3735	dfa->has_mb_node = 1;
	3736	mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
	3737	if (BE (mbc_tree == NULL, 0))
	3738	goto build_word_op_espace;
	3739	/* Then join them by ALT node. */
	3740	tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
	3741	if (BE (mbc_tree != NULL, 1))
	3742	return tree;
	3743	}
	3744	else
	3745	{
	3746	free_charset (mbcset);
	3747	return tree;
	3748	}
	3749	#else /* not RE_ENABLE_I18N */
	3750	return tree;
	3751	#endif /* not RE_ENABLE_I18N */
	3752
	3753	build_word_op_espace:
	3754	re_free (sbcset);
	3755	#ifdef RE_ENABLE_I18N
	3756	free_charset (mbcset);
	3757	#endif /* RE_ENABLE_I18N */
	3758	*err = REG_ESPACE;
	3759	return NULL;
	3760	}
	3761
	3762	/* This is intended for the expressions like "a{1,3}".
	3763	Fetch a number from 'input', and return the number.
	3764	Return REG_MISSING if the number field is empty like "{,1}".
	3765	Return RE_DUP_MAX + 1 if the number field is too large.
	3766	Return REG_ERROR if an error occurred. */
	3767
	3768	static Idx
	3769	fetch_number (re_string_t input, re_token_t token, reg_syntax_t syntax)
	3770	{
	3771	Idx num = REG_MISSING;
	3772	unsigned char c;
	3773	while (1)
	3774	{
	3775	fetch_token (token, input, syntax);
	3776	c = token->opr.c;
	3777	if (BE (token->type == END_OF_RE, 0))
	3778	return REG_ERROR;
	3779	if (token->type == OP_CLOSE_DUP_NUM \|\| c == ',')
	3780	break;
	3781	num = ((token->type != CHARACTER \|\| c < '0' \|\| '9' < c
	3782	\|\| num == REG_ERROR)
	3783	? REG_ERROR
	3784	: num == REG_MISSING
	3785	? c - '0'
	3786	: MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
	3787	}
	3788	return num;
	3789	}
	3790	\f
	3791	#ifdef RE_ENABLE_I18N
	3792	static void
	3793	free_charset (re_charset_t *cset)
	3794	{
	3795	re_free (cset->mbchars);
	3796	# ifdef _LIBC
	3797	re_free (cset->coll_syms);
	3798	re_free (cset->equiv_classes);
	3799	re_free (cset->range_starts);
	3800	re_free (cset->range_ends);
	3801	# endif
	3802	re_free (cset->char_classes);
	3803	re_free (cset);
	3804	}
	3805	#endif /* RE_ENABLE_I18N */
	3806	\f
	3807	/* Functions for binary tree operation. */
	3808
	3809	/* Create a tree node. */
	3810
	3811	static bin_tree_t *
	3812	create_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
	3813	re_token_type_t type)
	3814	{
	3815	re_token_t t;
	3816	t.type = type;
	3817	return create_token_tree (dfa, left, right, &t);
	3818	}
	3819
	3820	static bin_tree_t *
	3821	create_token_tree (re_dfa_t dfa, bin_tree_t left, bin_tree_t *right,
	3822	const re_token_t *token)
	3823	{
	3824	bin_tree_t *tree;
	3825	if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
	3826	{
	3827	bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
	3828
	3829	if (storage == NULL)
	3830	return NULL;
	3831	storage->next = dfa->str_tree_storage;
	3832	dfa->str_tree_storage = storage;
	3833	dfa->str_tree_storage_idx = 0;
	3834	}
	3835	tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
	3836
	3837	tree->parent = NULL;
	3838	tree->left = left;
	3839	tree->right = right;
	3840	tree->token = *token;
	3841	tree->token.duplicated = 0;
	3842	tree->token.opt_subexp = 0;
	3843	tree->first = NULL;
	3844	tree->next = NULL;
	3845	tree->node_idx = REG_MISSING;
	3846
	3847	if (left != NULL)
	3848	left->parent = tree;
	3849	if (right != NULL)
	3850	right->parent = tree;
	3851	return tree;
	3852	}
	3853
	3854	/* Mark the tree SRC as an optional subexpression.
	3855	To be called from preorder or postorder. */
	3856
	3857	static reg_errcode_t
	3858	mark_opt_subexp (void extra, bin_tree_t node)
	3859	{
	3860	Idx idx = (uintptr_t) extra;
	3861	if (node->token.type == SUBEXP && node->token.opr.idx == idx)
	3862	node->token.opt_subexp = 1;
	3863
	3864	return REG_NOERROR;
	3865	}
	3866
	3867	/* Free the allocated memory inside NODE. */
	3868
	3869	static void
	3870	free_token (re_token_t *node)
	3871	{
	3872	#ifdef RE_ENABLE_I18N
	3873	if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
	3874	free_charset (node->opr.mbcset);
	3875	else
	3876	#endif /* RE_ENABLE_I18N */
	3877	if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
	3878	re_free (node->opr.sbcset);
	3879	}
	3880
	3881	/* Worker function for tree walking. Free the allocated memory inside NODE
	3882	and its children. */
	3883
	3884	static reg_errcode_t
	3885	free_tree (void extra, bin_tree_t node)
	3886	{
	3887	free_token (&node->token);
	3888	return REG_NOERROR;
	3889	}
	3890
	3891
	3892	/* Duplicate the node SRC, and return new node. This is a preorder
	3893	visit similar to the one implemented by the generic visitor, but
	3894	we need more infrastructure to maintain two parallel trees --- so,
	3895	it's easier to duplicate. */
	3896
	3897	static bin_tree_t *
	3898	duplicate_tree (const bin_tree_t root, re_dfa_t dfa)
	3899	{
	3900	const bin_tree_t *node;
	3901	bin_tree_t *dup_root;
	3902	bin_tree_t *p_new = &dup_root, dup_node = root->parent;
	3903
	3904	for (node = root; ; )
	3905	{
	3906	/* Create a new tree and link it back to the current parent. */
	3907	*p_new = create_token_tree (dfa, NULL, NULL, &node->token);
	3908	if (*p_new == NULL)
	3909	return NULL;
	3910	(*p_new)->parent = dup_node;
	3911	(*p_new)->token.duplicated = 1;
	3912	dup_node = *p_new;
	3913
	3914	/* Go to the left node, or up and to the right. */
	3915	if (node->left)
	3916	{
	3917	node = node->left;
	3918	p_new = &dup_node->left;
	3919	}
	3920	else
	3921	{
	3922	const bin_tree_t *prev = NULL;
	3923	while (node->right == prev \|\| node->right == NULL)
	3924	{
	3925	prev = node;
	3926	node = node->parent;
	3927	dup_node = dup_node->parent;
	3928	if (!node)
	3929	return dup_root;
	3930	}
	3931	node = node->right;
	3932	p_new = &dup_node->right;
	3933	}
	3934	}
	3935	}