Planet

navi

home

PPS

about

screenshots

download

development

forum

Context Navigation

source: downloads/boost_1_34_1/boost/regex/v4/basic_regex_parser.hpp @ 29

Last change on this file since 29 was 29, checked in by landauf, 17 years ago
updated boost from 1_33_1 to 1_34_1
File size: 64.1 KB

Line
1	/*
2	*
3	* Copyright (c) 2004
4	* John Maddock
5	*
6	* Use, modification and distribution are subject to the
7	* Boost Software License, Version 1.0. (See accompanying file
8	* LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9	*
10	*/
11
12	/*
13	* LOCATION: see http://www.boost.org for most recent version.
14	* FILE basic_regex_parser.hpp
15	* VERSION see <boost/version.hpp>
16	* DESCRIPTION: Declares template class basic_regex_parser.
17	*/
18
19	#ifndef BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
20	#define BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
21
22	#ifdef BOOST_HAS_ABI_HEADERS
23	# include BOOST_ABI_PREFIX
24	#endif
25
26	namespace boost{
27	namespace re_detail{
28
29	#ifdef BOOST_MSVC
30	#pragma warning(push)
31	#pragma warning(disable:4244)
32	#endif
33
34	template <class charT, class traits>
35	class basic_regex_parser : public basic_regex_creator<charT, traits>
36	{
37	public:
38	basic_regex_parser(regex_data<charT, traits>* data);
39	void parse(const charT* p1, const charT* p2, unsigned flags);
40	void fail(regex_constants::error_type error_code, std::ptrdiff_t position);
41
42	bool parse_all();
43	bool parse_basic();
44	bool parse_extended();
45	bool parse_literal();
46	bool parse_open_paren();
47	bool parse_basic_escape();
48	bool parse_extended_escape();
49	bool parse_match_any();
50	bool parse_repeat(std::size_t low = 0, std::size_t high = (std::numeric_limits<std::size_t>::max)());
51	bool parse_repeat_range(bool isbasic);
52	bool parse_alt();
53	bool parse_set();
54	bool parse_backref();
55	void parse_set_literal(basic_char_set<charT, traits>& char_set);
56	bool parse_inner_set(basic_char_set<charT, traits>& char_set);
57	bool parse_QE();
58	bool parse_perl_extension();
59	bool add_emacs_code(bool negate);
60	bool unwind_alts(std::ptrdiff_t last_paren_start);
61	digraph<charT> get_next_set_literal(basic_char_set<charT, traits>& char_set);
62	charT unescape_character();
63	regex_constants::syntax_option_type parse_options();
64
65	private:
66	typedef bool (basic_regex_parser::*parser_proc_type)();
67	typedef typename traits::string_type string_type;
68	typedef typename traits::char_class_type char_class_type;
69	parser_proc_type m_parser_proc; // the main parser to use
70	const charT* m_base; // the start of the string being parsed
71	const charT* m_end; // the end of the string being parsed
72	const charT* m_position; // our current parser position
73	unsigned m_mark_count; // how many sub-expressions we have
74	std::ptrdiff_t m_paren_start; // where the last seen ')' began (where repeats are inserted).
75	std::ptrdiff_t m_alt_insert_point; // where to insert the next alternative
76	bool m_has_case_change; // true if somewhere in the current block the case has changed
77	#if defined(BOOST_MSVC) && defined(_M_IX86)
78	// This is an ugly warning suppression workaround (for warnings inside std::vector
79	// that can not otherwise be suppressed)...
80	BOOST_STATIC_ASSERT(sizeof(long) >= sizeof(void*));
81	std::vector<long> m_alt_jumps; // list of alternative in the current scope.
82	#else
83	std::vector<std::ptrdiff_t> m_alt_jumps; // list of alternative in the current scope.
84	#endif
85
86	basic_regex_parser& operator=(const basic_regex_parser&);
87	basic_regex_parser(const basic_regex_parser&);
88	};
89
90	template <class charT, class traits>
91	basic_regex_parser<charT, traits>::basic_regex_parser(regex_data<charT, traits>* data)
92	: basic_regex_creator<charT, traits>(data), m_mark_count(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false)
93	{
94	}
95
96	template <class charT, class traits>
97	void basic_regex_parser<charT, traits>::parse(const charT* p1, const charT* p2, unsigned l_flags)
98	{
99	// pass l_flags on to base class:
100	this->init(l_flags);
101	// set up pointers:
102	m_position = m_base = p1;
103	m_end = p2;
104	// empty strings are errors:
105	if(p1 == p2)
106	{
107	fail(regex_constants::error_empty, 0);
108	return;
109	}
110	// select which parser to use:
111	switch(l_flags & regbase::main_option_type)
112	{
113	case regbase::perl_syntax_group:
114	m_parser_proc = &basic_regex_parser<charT, traits>::parse_extended;
115	break;
116	case regbase::basic_syntax_group:
117	m_parser_proc = &basic_regex_parser<charT, traits>::parse_basic;
118	break;
119	case regbase::literal:
120	m_parser_proc = &basic_regex_parser<charT, traits>::parse_literal;
121	break;
122	}
123
124	// parse all our characters:
125	bool result = parse_all();
126	//
127	// Unwind our alternatives:
128	//
129	unwind_alts(-1);
130	// reset l_flags as a global scope (?imsx) may have altered them:
131	this->flags(l_flags);
132	// if we haven't gobbled up all the characters then we must
133	// have had an unexpected ')' :
134	if(!result)
135	{
136	fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_position));
137	return;
138	}
139	// if an error has been set then give up now:
140	if(this->m_pdata->m_status)
141	return;
142	// fill in our sub-expression count:
143	this->m_pdata->m_mark_count = 1 + m_mark_count;
144	this->finalize(p1, p2);
145	}
146
147	template <class charT, class traits>
148	void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position)
149	{
150	if(0 == this->m_pdata->m_status) // update the error code if not already set
151	this->m_pdata->m_status = error_code;
152	m_position = m_end; // don't bother parsing anything else
153	// get the error message:
154	std::string message = this->m_pdata->m_ptraits->error_string(error_code);
155	// and raise the exception, this will do nothing if exceptions are disabled:
156	#ifndef BOOST_NO_EXCEPTIONS
157	if(0 == (this->flags() & regex_constants::no_except))
158	{
159	boost::regex_error e(message, error_code, position);
160	e.raise();
161	}
162	#else
163	(void)position; // suppress warnings.
164	#endif
165	}
166
167	template <class charT, class traits>
168	bool basic_regex_parser<charT, traits>::parse_all()
169	{
170	bool result = true;
171	while(result && (m_position != m_end))
172	{
173	result = (this->*m_parser_proc)();
174	}
175	return result;
176	}
177
178	#ifdef BOOST_MSVC
179	#pragma warning(push)
180	#pragma warning(disable:4702)
181	#endif
182	template <class charT, class traits>
183	bool basic_regex_parser<charT, traits>::parse_basic()
184	{
185	switch(this->m_traits.syntax_type(*m_position))
186	{
187	case regex_constants::syntax_escape:
188	return parse_basic_escape();
189	case regex_constants::syntax_dot:
190	return parse_match_any();
191	case regex_constants::syntax_caret:
192	++m_position;
193	this->append_state(syntax_element_start_line);
194	break;
195	case regex_constants::syntax_dollar:
196	++m_position;
197	this->append_state(syntax_element_end_line);
198	break;
199	case regex_constants::syntax_star:
200	if(!(this->m_last_state) \|\| (this->m_last_state->type == syntax_element_start_line))
201	return parse_literal();
202	else
203	{
204	++m_position;
205	return parse_repeat();
206	}
207	case regex_constants::syntax_plus:
208	if(!(this->m_last_state) \|\| (this->m_last_state->type == syntax_element_start_line) \|\| !(this->flags() & regbase::emacs_ex))
209	return parse_literal();
210	else
211	{
212	++m_position;
213	return parse_repeat(1);
214	}
215	case regex_constants::syntax_question:
216	if(!(this->m_last_state) \|\| (this->m_last_state->type == syntax_element_start_line) \|\| !(this->flags() & regbase::emacs_ex))
217	return parse_literal();
218	else
219	{
220	++m_position;
221	return parse_repeat(0, 1);
222	}
223	case regex_constants::syntax_open_set:
224	return parse_set();
225	case regex_constants::syntax_newline:
226	if(this->flags() & regbase::newline_alt)
227	return parse_alt();
228	else
229	return parse_literal();
230	default:
231	return parse_literal();
232	}
233	return true;
234	}
235
236	template <class charT, class traits>
237	bool basic_regex_parser<charT, traits>::parse_extended()
238	{
239	bool result = true;
240	switch(this->m_traits.syntax_type(*m_position))
241	{
242	case regex_constants::syntax_open_mark:
243	return parse_open_paren();
244	case regex_constants::syntax_close_mark:
245	return false;
246	case regex_constants::syntax_escape:
247	return parse_extended_escape();
248	case regex_constants::syntax_dot:
249	return parse_match_any();
250	case regex_constants::syntax_caret:
251	++m_position;
252	this->append_state(
253	(this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_start : syntax_element_start_line));
254	break;
255	case regex_constants::syntax_dollar:
256	++m_position;
257	this->append_state(
258	(this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_end : syntax_element_end_line));
259	break;
260	case regex_constants::syntax_star:
261	if(m_position == this->m_base)
262	{
263	fail(regex_constants::error_badrepeat, 0);
264	return false;
265	}
266	++m_position;
267	return parse_repeat();
268	case regex_constants::syntax_question:
269	if(m_position == this->m_base)
270	{
271	fail(regex_constants::error_badrepeat, 0);
272	return false;
273	}
274	++m_position;
275	return parse_repeat(0,1);
276	case regex_constants::syntax_plus:
277	if(m_position == this->m_base)
278	{
279	fail(regex_constants::error_badrepeat, 0);
280	return false;
281	}
282	++m_position;
283	return parse_repeat(1);
284	case regex_constants::syntax_open_brace:
285	++m_position;
286	return parse_repeat_range(false);
287	case regex_constants::syntax_close_brace:
288	fail(regex_constants::error_brace, this->m_position - this->m_end);
289	return false;
290	case regex_constants::syntax_or:
291	return parse_alt();
292	case regex_constants::syntax_open_set:
293	return parse_set();
294	case regex_constants::syntax_newline:
295	if(this->flags() & regbase::newline_alt)
296	return parse_alt();
297	else
298	return parse_literal();
299	case regex_constants::syntax_hash:
300	//
301	// If we have a mod_x flag set, then skip until
302	// we get to a newline character:
303	//
304	if((this->flags()
305	& (regbase::no_perl_ex\|regbase::mod_x))
306	== regbase::mod_x)
307	{
308	while((m_position != m_end) && !is_separator(*m_position++)){}
309	return true;
310	}
311	// Otherwise fall through:
312	default:
313	result = parse_literal();
314	break;
315	}
316	return result;
317	}
318	#ifdef BOOST_MSVC
319	#pragma warning(pop)
320	#endif
321
322	template <class charT, class traits>
323	bool basic_regex_parser<charT, traits>::parse_literal()
324	{
325	// append this as a literal provided it's not a space character
326	// or the perl option regbase::mod_x is not set:
327	if(
328	((this->flags()
329	& (regbase::main_option_type\|regbase::mod_x\|regbase::no_perl_ex))
330	!= regbase::mod_x)
331	\|\| !this->m_traits.isctype(*m_position, this->m_mask_space))
332	this->append_literal(*m_position);
333	++m_position;
334	return true;
335	}
336
337	template <class charT, class traits>
338	bool basic_regex_parser<charT, traits>::parse_open_paren()
339	{
340	//
341	// skip the '(' and error check:
342	//
343	if(++m_position == m_end)
344	{
345	fail(regex_constants::error_paren, m_position - m_base);
346	return false;
347	}
348	//
349	// begin by checking for a perl-style (?...) extension:
350	//
351	if(
352	((this->flags() & (regbase::main_option_type \| regbase::no_perl_ex)) == 0)
353	\|\| ((this->flags() & (regbase::main_option_type \| regbase::emacs_ex)) == (regbase::basic_syntax_group\|regbase::emacs_ex))
354	)
355	{
356	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
357	return parse_perl_extension();
358	}
359	//
360	// update our mark count, and append the required state:
361	//
362	unsigned markid = 0;
363	if(0 == (this->flags() & regbase::nosubs))
364	markid = ++m_mark_count;
365	re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
366	pb->index = markid;
367	std::ptrdiff_t last_paren_start = this->getoffset(pb);
368	// back up insertion point for alternations, and set new point:
369	std::ptrdiff_t last_alt_point = m_alt_insert_point;
370	this->m_pdata->m_data.align();
371	m_alt_insert_point = this->m_pdata->m_data.size();
372	//
373	// back up the current flags in case we have a nested (?imsx) group:
374	//
375	regex_constants::syntax_option_type opts = this->flags();
376	bool old_case_change = m_has_case_change;
377	m_has_case_change = false; // no changes to this scope as yet...
378	//
379	// now recursively add more states, this will terminate when we get to a
380	// matching ')' :
381	//
382	parse_all();
383	//
384	// Unwind pushed alternatives:
385	//
386	if(0 == unwind_alts(last_paren_start))
387	return false;
388	//
389	// restore flags:
390	//
391	if(m_has_case_change)
392	{
393	// the case has changed in one or more of the alternatives
394	// within the scoped (...) block: we have to add a state
395	// to reset the case sensitivity:
396	static_cast<re_case*>(
397	this->append_state(syntax_element_toggle_case, sizeof(re_case))
398	)->icase = opts & regbase::icase;
399	}
400	this->flags(opts);
401	m_has_case_change = old_case_change;
402	//
403	// we either have a ')' or we have run out of characters prematurely:
404	//
405	if(m_position == m_end)
406	{
407	this->fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_end));
408	return false;
409	}
410	BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
411	++m_position;
412	//
413	// append closing parenthesis state:
414	//
415	pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
416	pb->index = markid;
417	this->m_paren_start = last_paren_start;
418	//
419	// restore the alternate insertion point:
420	//
421	this->m_alt_insert_point = last_alt_point;
422	//
423	// allow backrefs to this mark:
424	//
425	if((markid > 0) && (markid < sizeof(unsigned) * CHAR_BIT))
426	this->m_backrefs \|= 1u << (markid - 1);
427
428	return true;
429	}
430
431	template <class charT, class traits>
432	bool basic_regex_parser<charT, traits>::parse_basic_escape()
433	{
434	++m_position;
435	bool result = true;
436	switch(this->m_traits.escape_syntax_type(*m_position))
437	{
438	case regex_constants::syntax_open_mark:
439	return parse_open_paren();
440	case regex_constants::syntax_close_mark:
441	return false;
442	case regex_constants::syntax_plus:
443	if(this->flags() & regex_constants::bk_plus_qm)
444	{
445	++m_position;
446	return parse_repeat(1);
447	}
448	else
449	return parse_literal();
450	case regex_constants::syntax_question:
451	if(this->flags() & regex_constants::bk_plus_qm)
452	{
453	++m_position;
454	return parse_repeat(0, 1);
455	}
456	else
457	return parse_literal();
458	case regex_constants::syntax_open_brace:
459	if(this->flags() & regbase::no_intervals)
460	return parse_literal();
461	++m_position;
462	return parse_repeat_range(true);
463	case regex_constants::syntax_close_brace:
464	if(this->flags() & regbase::no_intervals)
465	return parse_literal();
466	fail(regex_constants::error_brace, this->m_position - this->m_base);
467	return false;
468	case regex_constants::syntax_or:
469	if(this->flags() & regbase::bk_vbar)
470	return parse_alt();
471	else
472	result = parse_literal();
473	break;
474	case regex_constants::syntax_digit:
475	return parse_backref();
476	case regex_constants::escape_type_start_buffer:
477	if(this->flags() & regbase::emacs_ex)
478	{
479	++m_position;
480	this->append_state(syntax_element_buffer_start);
481	}
482	else
483	result = parse_literal();
484	break;
485	case regex_constants::escape_type_end_buffer:
486	if(this->flags() & regbase::emacs_ex)
487	{
488	++m_position;
489	this->append_state(syntax_element_buffer_end);
490	}
491	else
492	result = parse_literal();
493	break;
494	case regex_constants::escape_type_word_assert:
495	if(this->flags() & regbase::emacs_ex)
496	{
497	++m_position;
498	this->append_state(syntax_element_word_boundary);
499	}
500	else
501	result = parse_literal();
502	break;
503	case regex_constants::escape_type_not_word_assert:
504	if(this->flags() & regbase::emacs_ex)
505	{
506	++m_position;
507	this->append_state(syntax_element_within_word);
508	}
509	else
510	result = parse_literal();
511	break;
512	case regex_constants::escape_type_left_word:
513	if(this->flags() & regbase::emacs_ex)
514	{
515	++m_position;
516	this->append_state(syntax_element_word_start);
517	}
518	else
519	result = parse_literal();
520	break;
521	case regex_constants::escape_type_right_word:
522	if(this->flags() & regbase::emacs_ex)
523	{
524	++m_position;
525	this->append_state(syntax_element_word_end);
526	}
527	else
528	result = parse_literal();
529	break;
530	default:
531	if(this->flags() & regbase::emacs_ex)
532	{
533	bool negate = true;
534	switch(*m_position)
535	{
536	case 'w':
537	negate = false;
538	// fall through:
539	case 'W':
540	{
541	basic_char_set<charT, traits> char_set;
542	if(negate)
543	char_set.negate();
544	char_set.add_class(this->m_word_mask);
545	if(0 == this->append_set(char_set))
546	{
547	fail(regex_constants::error_ctype, m_position - m_base);
548	return false;
549	}
550	++m_position;
551	return true;
552	}
553	case 's':
554	negate = false;
555	// fall through:
556	case 'S':
557	return add_emacs_code(negate);
558	case 'c':
559	case 'C':
560	// not supported yet:
561	fail(regex_constants::error_escape, m_position - m_base);
562	return false;
563	default:
564	break;
565	}
566	}
567	result = parse_literal();
568	break;
569	}
570	return result;
571	}
572
573	template <class charT, class traits>
574	bool basic_regex_parser<charT, traits>::parse_extended_escape()
575	{
576	++m_position;
577	bool negate = false; // in case this is a character class escape: \w \d etc
578	switch(this->m_traits.escape_syntax_type(*m_position))
579	{
580	case regex_constants::escape_type_not_class:
581	negate = true;
582	// fall through:
583	case regex_constants::escape_type_class:
584	{
585	typedef typename traits::char_class_type mask_type;
586	mask_type m = this->m_traits.lookup_classname(m_position, m_position+1);
587	if(m != 0)
588	{
589	basic_char_set<charT, traits> char_set;
590	if(negate)
591	char_set.negate();
592	char_set.add_class(m);
593	if(0 == this->append_set(char_set))
594	{
595	fail(regex_constants::error_ctype, m_position - m_base);
596	return false;
597	}
598	++m_position;
599	return true;
600	}
601	//
602	// not a class, just a regular unknown escape:
603	//
604	this->append_literal(unescape_character());
605	break;
606	}
607	case regex_constants::syntax_digit:
608	return parse_backref();
609	case regex_constants::escape_type_left_word:
610	++m_position;
611	this->append_state(syntax_element_word_start);
612	break;
613	case regex_constants::escape_type_right_word:
614	++m_position;
615	this->append_state(syntax_element_word_end);
616	break;
617	case regex_constants::escape_type_start_buffer:
618	++m_position;
619	this->append_state(syntax_element_buffer_start);
620	break;
621	case regex_constants::escape_type_end_buffer:
622	++m_position;
623	this->append_state(syntax_element_buffer_end);
624	break;
625	case regex_constants::escape_type_word_assert:
626	++m_position;
627	this->append_state(syntax_element_word_boundary);
628	break;
629	case regex_constants::escape_type_not_word_assert:
630	++m_position;
631	this->append_state(syntax_element_within_word);
632	break;
633	case regex_constants::escape_type_Z:
634	++m_position;
635	this->append_state(syntax_element_soft_buffer_end);
636	break;
637	case regex_constants::escape_type_Q:
638	return parse_QE();
639	case regex_constants::escape_type_C:
640	return parse_match_any();
641	case regex_constants::escape_type_X:
642	++m_position;
643	this->append_state(syntax_element_combining);
644	break;
645	case regex_constants::escape_type_G:
646	++m_position;
647	this->append_state(syntax_element_restart_continue);
648	break;
649	case regex_constants::escape_type_not_property:
650	negate = true;
651	// fall through:
652	case regex_constants::escape_type_property:
653	{
654	++m_position;
655	char_class_type m;
656	if(m_position == m_end)
657	{
658	fail(regex_constants::error_escape, m_position - m_base);
659	return false;
660	}
661	// maybe have \p{ddd}
662	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
663	{
664	const charT* base = m_position;
665	// skip forward until we find enclosing brace:
666	while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
667	++m_position;
668	if(m_position == m_end)
669	{
670	fail(regex_constants::error_escape, m_position - m_base);
671	return false;
672	}
673	m = this->m_traits.lookup_classname(++base, m_position++);
674	}
675	else
676	{
677	m = this->m_traits.lookup_classname(m_position, m_position+1);
678	++m_position;
679	}
680	if(m != 0)
681	{
682	basic_char_set<charT, traits> char_set;
683	if(negate)
684	char_set.negate();
685	char_set.add_class(m);
686	if(0 == this->append_set(char_set))
687	{
688	fail(regex_constants::error_ctype, m_position - m_base);
689	return false;
690	}
691	return true;
692	}
693	fail(regex_constants::error_ctype, m_position - m_base);
694	}
695	default:
696	this->append_literal(unescape_character());
697	break;
698	}
699	return true;
700	}
701
702	template <class charT, class traits>
703	bool basic_regex_parser<charT, traits>::parse_match_any()
704	{
705	//
706	// we have a '.' that can match any character:
707	//
708	++m_position;
709	static_cast<re_dot*>(
710	this->append_state(syntax_element_wild, sizeof(re_dot))
711	)->mask = static_cast<unsigned char>(this->flags() & regbase::no_mod_s
712	? re_detail::force_not_newline
713	: this->flags() & regbase::mod_s ?
714	re_detail::force_newline : re_detail::dont_care);
715	return true;
716	}
717
718	template <class charT, class traits>
719	bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_t high)
720	{
721	bool greedy = true;
722	std::size_t insert_point;
723	//
724	// when we get to here we may have a non-greedy ? mark still to come:
725	//
726	if((m_position != m_end)
727	&& (
728	(0 == (this->flags() & (regbase::main_option_type \| regbase::no_perl_ex)))
729	\|\| ((regbase::basic_syntax_group\|regbase::emacs_ex) == (this->flags() & (regbase::main_option_type \| regbase::emacs_ex)))
730	)
731	)
732	{
733	// OK we have a perl regex, check for a '?':
734	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
735	{
736	greedy = false;
737	++m_position;
738	}
739	}
740	if(0 == this->m_last_state)
741	{
742	fail(regex_constants::error_badrepeat, ::boost::re_detail::distance(m_base, m_position));
743	return false;
744	}
745	if(this->m_last_state->type == syntax_element_endmark)
746	{
747	// insert a repeat before the '(' matching the last ')':
748	insert_point = this->m_paren_start;
749	}
750	else if((this->m_last_state->type == syntax_element_literal) && (static_cast<re_literal*>(this->m_last_state)->length > 1))
751	{
752	// the last state was a literal with more than one character, split it in two:
753	re_literal* lit = static_cast<re_literal*>(this->m_last_state);
754	charT c = (static_cast<charT>(static_cast<void>(lit+1)))[lit->length - 1];
755	--(lit->length);
756	// now append new state:
757	lit = static_cast<re_literal*>(this->append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
758	lit->length = 1;
759	(static_cast<charT>(static_cast<void>(lit+1)))[0] = c;
760	insert_point = this->getoffset(this->m_last_state);
761	}
762	else
763	{
764	// repeat the last state whatever it was, need to add some error checking here:
765	switch(this->m_last_state->type)
766	{
767	case syntax_element_start_line:
768	case syntax_element_end_line:
769	case syntax_element_word_boundary:
770	case syntax_element_within_word:
771	case syntax_element_word_start:
772	case syntax_element_word_end:
773	case syntax_element_buffer_start:
774	case syntax_element_buffer_end:
775	case syntax_element_alt:
776	case syntax_element_soft_buffer_end:
777	case syntax_element_restart_continue:
778	case syntax_element_jump:
779	case syntax_element_startmark:
780	// can't legally repeat any of the above:
781	fail(regex_constants::error_badrepeat, m_position - m_base);
782	return false;
783	default:
784	// do nothing...
785	break;
786	}
787	insert_point = this->getoffset(this->m_last_state);
788	}
789	//
790	// OK we now know what to repeat, so insert the repeat around it:
791	//
792	re_repeat* rep = static_cast<re_repeat*>(this->insert_state(insert_point, syntax_element_rep, re_repeater_size));
793	rep->min = low;
794	rep->max = high;
795	rep->greedy = greedy;
796	rep->leading = false;
797	// store our repeater position for later:
798	std::ptrdiff_t rep_off = this->getoffset(rep);
799	// and append a back jump to the repeat:
800	re_jump* jmp = static_cast<re_jump*>(this->append_state(syntax_element_jump, sizeof(re_jump)));
801	jmp->alt.i = rep_off - this->getoffset(jmp);
802	this->m_pdata->m_data.align();
803	// now fill in the alt jump for the repeat:
804	rep = static_cast<re_repeat*>(this->getaddress(rep_off));
805	rep->alt.i = this->m_pdata->m_data.size() - rep_off;
806	return true;
807	}
808
809	template <class charT, class traits>
810	bool basic_regex_parser<charT, traits>::parse_repeat_range(bool isbasic)
811	{
812	//
813	// parse a repeat-range:
814	//
815	std::size_t min, max;
816	int v;
817	// skip whitespace:
818	while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
819	++m_position;
820	// fail if at end:
821	if(this->m_position == this->m_end)
822	{
823	fail(regex_constants::error_brace, this->m_position - this->m_base);
824	return false;
825	}
826	// get min:
827	v = this->m_traits.toi(m_position, m_end, 10);
828	// skip whitespace:
829	while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
830	++m_position;
831	if(v < 0)
832	{
833	fail(regex_constants::error_badbrace, this->m_position - this->m_base);
834	return false;
835	}
836	else if(this->m_position == this->m_end)
837	{
838	fail(regex_constants::error_brace, this->m_position - this->m_base);
839	return false;
840	}
841	min = v;
842	// see if we have a comma:
843	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_comma)
844	{
845	// move on and error check:
846	++m_position;
847	// skip whitespace:
848	while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
849	++m_position;
850	if(this->m_position == this->m_end)
851	{
852	fail(regex_constants::error_brace, this->m_position - this->m_base);
853	return false;
854	}
855	// get the value if any:
856	v = this->m_traits.toi(m_position, m_end, 10);
857	max = (v >= 0) ? v : (std::numeric_limits<std::size_t>::max)();
858	}
859	else
860	{
861	// no comma, max = min:
862	max = min;
863	}
864	// skip whitespace:
865	while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
866	++m_position;
867	// OK now check trailing }:
868	if(this->m_position == this->m_end)
869	{
870	fail(regex_constants::error_brace, this->m_position - this->m_base);
871	return false;
872	}
873	if(isbasic)
874	{
875	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_escape)
876	{
877	++m_position;
878	if(this->m_position == this->m_end)
879	{
880	fail(regex_constants::error_brace, this->m_position - this->m_base);
881	return false;
882	}
883	}
884	else
885	{
886	fail(regex_constants::error_badbrace, this->m_position - this->m_base);
887	return false;
888	}
889	}
890	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_brace)
891	++m_position;
892	else
893	{
894	fail(regex_constants::error_badbrace, this->m_position - this->m_base);
895	return false;
896	}
897	//
898	// finally go and add the repeat, unless error:
899	//
900	if(min > max)
901	{
902	fail(regex_constants::error_badbrace, this->m_position - this->m_base);
903	return false;
904	}
905	return parse_repeat(min, max);
906	}
907
908	template <class charT, class traits>
909	bool basic_regex_parser<charT, traits>::parse_alt()
910	{
911	//
912	// error check: if there have been no previous states,
913	// or if the last state was a '(' then error:
914	//
915	if((this->m_last_state == 0) \|\| (this->m_last_state->type == syntax_element_startmark))
916	{
917	fail(regex_constants::error_empty, this->m_position - this->m_base);
918	return false;
919	}
920	++m_position;
921	//
922	// we need to append a trailing jump:
923	//
924	re_syntax_base* pj = this->append_state(re_detail::syntax_element_jump, sizeof(re_jump));
925	std::ptrdiff_t jump_offset = this->getoffset(pj);
926	//
927	// now insert the alternative:
928	//
929	re_alt* palt = static_cast<re_alt*>(this->insert_state(this->m_alt_insert_point, syntax_element_alt, re_alt_size));
930	jump_offset += re_alt_size;
931	this->m_pdata->m_data.align();
932	palt->alt.i = this->m_pdata->m_data.size() - this->getoffset(palt);
933	//
934	// update m_alt_insert_point so that the next alternate gets
935	// inserted at the start of the second of the two we've just created:
936	//
937	this->m_alt_insert_point = this->m_pdata->m_data.size();
938	//
939	// the start of this alternative must have a case changes state
940	// if the current block has messed around with case changes:
941	//
942	if(m_has_case_change)
943	{
944	static_cast<re_case*>(
945	this->append_state(syntax_element_toggle_case, sizeof(re_case))
946	)->icase = this->m_icase;
947	}
948	//
949	// push the alternative onto our stack, a recursive
950	// implementation here is easier to understand (and faster
951	// as it happens), but causes all kinds of stack overflow problems
952	// on programs with small stacks (COM+).
953	//
954	m_alt_jumps.push_back(jump_offset);
955	return true;
956	}
957
958	template <class charT, class traits>
959	bool basic_regex_parser<charT, traits>::parse_set()
960	{
961	++m_position;
962	if(m_position == m_end)
963	{
964	fail(regex_constants::error_brack, m_position - m_base);
965	return false;
966	}
967	basic_char_set<charT, traits> char_set;
968
969	const charT* base = m_position; // where the '[' was
970	const charT* item_base = m_position; // where the '[' or '^' was
971
972	while(m_position != m_end)
973	{
974	switch(this->m_traits.syntax_type(*m_position))
975	{
976	case regex_constants::syntax_caret:
977	if(m_position == base)
978	{
979	char_set.negate();
980	++m_position;
981	item_base = m_position;
982	}
983	else
984	parse_set_literal(char_set);
985	break;
986	case regex_constants::syntax_close_set:
987	if(m_position == item_base)
988	{
989	parse_set_literal(char_set);
990	break;
991	}
992	else
993	{
994	++m_position;
995	if(0 == this->append_set(char_set))
996	{
997	fail(regex_constants::error_range, m_position - m_base);
998	return false;
999	}
1000	}
1001	return true;
1002	case regex_constants::syntax_open_set:
1003	if(parse_inner_set(char_set))
1004	break;
1005	return true;
1006	case regex_constants::syntax_escape:
1007	{
1008	//
1009	// look ahead and see if this is a character class shortcut
1010	// \d \w \s etc...
1011	//
1012	++m_position;
1013	if(this->m_traits.escape_syntax_type(*m_position)
1014	== regex_constants::escape_type_class)
1015	{
1016	char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
1017	if(m != 0)
1018	{
1019	char_set.add_class(m);
1020	++m_position;
1021	break;
1022	}
1023	}
1024	else if(this->m_traits.escape_syntax_type(*m_position)
1025	== regex_constants::escape_type_not_class)
1026	{
1027	// negated character class:
1028	char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
1029	if(m != 0)
1030	{
1031	char_set.add_negated_class(m);
1032	++m_position;
1033	break;
1034	}
1035	}
1036	// not a character class, just a regular escape:
1037	--m_position;
1038	parse_set_literal(char_set);
1039	break;
1040	}
1041	default:
1042	parse_set_literal(char_set);
1043	break;
1044	}
1045	}
1046	return m_position != m_end;
1047	}
1048
1049	template <class charT, class traits>
1050	bool basic_regex_parser<charT, traits>::parse_inner_set(basic_char_set<charT, traits>& char_set)
1051	{
1052	//
1053	// we have either a character class [:name:]
1054	// a collating element [.name.]
1055	// or an equivalence class [=name=]
1056	//
1057	if(m_end == ++m_position)
1058	{
1059	fail(regex_constants::error_brack, m_position - m_base);
1060	return false;
1061	}
1062	switch(this->m_traits.syntax_type(*m_position))
1063	{
1064	case regex_constants::syntax_dot:
1065	//
1066	// a collating element is treated as a literal:
1067	//
1068	--m_position;
1069	parse_set_literal(char_set);
1070	return true;
1071	case regex_constants::syntax_colon:
1072	{
1073	// check that character classes are actually enabled:
1074	if((this->flags() & (regbase::main_option_type \| regbase::no_char_classes))
1075	== (regbase::basic_syntax_group \| regbase::no_char_classes))
1076	{
1077	--m_position;
1078	parse_set_literal(char_set);
1079	return true;
1080	}
1081	// skip the ':'
1082	if(m_end == ++m_position)
1083	{
1084	fail(regex_constants::error_brack, m_position - m_base);
1085	return false;
1086	}
1087	const charT* name_first = m_position;
1088	// skip at least one character, then find the matching ':]'
1089	if(m_end == ++m_position)
1090	{
1091	fail(regex_constants::error_brack, m_position - m_base);
1092	return false;
1093	}
1094	while((m_position != m_end)
1095	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_colon))
1096	++m_position;
1097	const charT* name_last = m_position;
1098	if(m_end == m_position)
1099	{
1100	fail(regex_constants::error_brack, m_position - m_base);
1101	return false;
1102	}
1103	if((m_end == ++m_position)
1104	\|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1105	{
1106	fail(regex_constants::error_brack, m_position - m_base);
1107	return false;
1108	}
1109	//
1110	// check for negated class:
1111	//
1112	bool negated = false;
1113	if(this->m_traits.syntax_type(*name_first) == regex_constants::syntax_caret)
1114	{
1115	++name_first;
1116	negated = true;
1117	}
1118	typedef typename traits::char_class_type mask_type;
1119	mask_type m = this->m_traits.lookup_classname(name_first, name_last);
1120	if(m == 0)
1121	{
1122	if(char_set.empty() && (name_last - name_first == 1))
1123	{
1124	// maybe a special case:
1125	++m_position;
1126	if( (m_position != m_end)
1127	&& (this->m_traits.syntax_type(*m_position)
1128	== regex_constants::syntax_close_set))
1129	{
1130	if(this->m_traits.escape_syntax_type(*name_first)
1131	== regex_constants::escape_type_left_word)
1132	{
1133	++m_position;
1134	this->append_state(syntax_element_word_start);
1135	return false;
1136	}
1137	if(this->m_traits.escape_syntax_type(*name_first)
1138	== regex_constants::escape_type_right_word)
1139	{
1140	++m_position;
1141	this->append_state(syntax_element_word_end);
1142	return false;
1143	}
1144	}
1145	}
1146	fail(regex_constants::error_ctype, name_first - m_base);
1147	return false;
1148	}
1149	if(negated == false)
1150	char_set.add_class(m);
1151	else
1152	char_set.add_negated_class(m);
1153	++m_position;
1154	break;
1155	}
1156	case regex_constants::syntax_equal:
1157	{
1158	// skip the '='
1159	if(m_end == ++m_position)
1160	{
1161	fail(regex_constants::error_brack, m_position - m_base);
1162	return false;
1163	}
1164	const charT* name_first = m_position;
1165	// skip at least one character, then find the matching '=]'
1166	if(m_end == ++m_position)
1167	{
1168	fail(regex_constants::error_brack, m_position - m_base);
1169	return false;
1170	}
1171	while((m_position != m_end)
1172	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal))
1173	++m_position;
1174	const charT* name_last = m_position;
1175	if(m_end == m_position)
1176	{
1177	fail(regex_constants::error_brack, m_position - m_base);
1178	return false;
1179	}
1180	if((m_end == ++m_position)
1181	\|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1182	{
1183	fail(regex_constants::error_brack, m_position - m_base);
1184	return false;
1185	}
1186	string_type m = this->m_traits.lookup_collatename(name_first, name_last);
1187	if((0 == m.size()) \|\| (m.size() > 2))
1188	{
1189	fail(regex_constants::error_collate, name_first - m_base);
1190	return false;
1191	}
1192	digraph<charT> d;
1193	d.first = m[0];
1194	if(m.size() > 1)
1195	d.second = m[1];
1196	else
1197	d.second = 0;
1198	char_set.add_equivalent(d);
1199	++m_position;
1200	break;
1201	}
1202	default:
1203	--m_position;
1204	parse_set_literal(char_set);
1205	break;
1206	}
1207	return true;
1208	}
1209
1210	template <class charT, class traits>
1211	void basic_regex_parser<charT, traits>::parse_set_literal(basic_char_set<charT, traits>& char_set)
1212	{
1213	digraph<charT> start_range(get_next_set_literal(char_set));
1214	if(m_end == m_position)
1215	{
1216	fail(regex_constants::error_brack, m_position - m_base);
1217	return;
1218	}
1219	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
1220	{
1221	// we have a range:
1222	if(m_end == ++m_position)
1223	{
1224	fail(regex_constants::error_brack, m_position - m_base);
1225	return;
1226	}
1227	if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)
1228	{
1229	digraph<charT> end_range = get_next_set_literal(char_set);
1230	char_set.add_range(start_range, end_range);
1231	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
1232	{
1233	if(m_end == ++m_position)
1234	{
1235	fail(regex_constants::error_brack, m_position - m_base);
1236	return;
1237	}
1238	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_set)
1239	{
1240	// trailing - :
1241	--m_position;
1242	return;
1243	}
1244	fail(regex_constants::error_range, m_position - m_base);
1245	return;
1246	}
1247	return;
1248	}
1249	--m_position;
1250	}
1251	char_set.add_single(start_range);
1252	}
1253
1254	template <class charT, class traits>
1255	digraph<charT> basic_regex_parser<charT, traits>::get_next_set_literal(basic_char_set<charT, traits>& char_set)
1256	{
1257	digraph<charT> result;
1258	switch(this->m_traits.syntax_type(*m_position))
1259	{
1260	case regex_constants::syntax_dash:
1261	if(!char_set.empty())
1262	{
1263	// see if we are at the end of the set:
1264	if((++m_position == m_end) \|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1265	{
1266	fail(regex_constants::error_range, m_position - m_base);
1267	return result;
1268	}
1269	--m_position;
1270	}
1271	result.first = *m_position++;
1272	return result;
1273	case regex_constants::syntax_escape:
1274	// check to see if escapes are supported first:
1275	if(this->flags() & regex_constants::no_escape_in_lists)
1276	{
1277	result = *m_position++;
1278	break;
1279	}
1280	++m_position;
1281	result = unescape_character();
1282	break;
1283	case regex_constants::syntax_open_set:
1284	{
1285	if(m_end == ++m_position)
1286	{
1287	fail(regex_constants::error_collate, m_position - m_base);
1288	return result;
1289	}
1290	if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot)
1291	{
1292	--m_position;
1293	result.first = *m_position;
1294	++m_position;
1295	return result;
1296	}
1297	if(m_end == ++m_position)
1298	{
1299	fail(regex_constants::error_collate, m_position - m_base);
1300	return result;
1301	}
1302	const charT* name_first = m_position;
1303	// skip at least one character, then find the matching ':]'
1304	if(m_end == ++m_position)
1305	{
1306	fail(regex_constants::error_collate, name_first - m_base);
1307	return result;
1308	}
1309	while((m_position != m_end)
1310	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot))
1311	++m_position;
1312	const charT* name_last = m_position;
1313	if(m_end == m_position)
1314	{
1315	fail(regex_constants::error_collate, name_first - m_base);
1316	return result;
1317	}
1318	if((m_end == ++m_position)
1319	\|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1320	{
1321	fail(regex_constants::error_collate, name_first - m_base);
1322	return result;
1323	}
1324	++m_position;
1325	string_type s = this->m_traits.lookup_collatename(name_first, name_last);
1326	if(s.empty() \|\| (s.size() > 2))
1327	{
1328	fail(regex_constants::error_collate, name_first - m_base);
1329	return result;
1330	}
1331	result.first = s[0];
1332	if(s.size() > 1)
1333	result.second = s[1];
1334	else
1335	result.second = 0;
1336	return result;
1337	}
1338	default:
1339	result = *m_position++;
1340	}
1341	return result;
1342	}
1343
1344	//
1345	// does a value fit in the specified charT type?
1346	//
1347	template <class charT>
1348	bool valid_value(charT, int v, const mpl::true_&)
1349	{
1350	return (v >> (sizeof(charT) * CHAR_BIT)) == 0;
1351	}
1352	template <class charT>
1353	bool valid_value(charT, int, const mpl::false_&)
1354	{
1355	return true; // v will alsways fit in a charT
1356	}
1357	template <class charT>
1358	bool valid_value(charT c, int v)
1359	{
1360	return valid_value(c, v, mpl::bool_<(sizeof(charT) < sizeof(int))>());
1361	}
1362
1363	template <class charT, class traits>
1364	charT basic_regex_parser<charT, traits>::unescape_character()
1365	{
1366	#ifdef BOOST_MSVC
1367	#pragma warning(push)
1368	#pragma warning(disable:4127)
1369	#endif
1370	charT result(0);
1371	if(m_position == m_end)
1372	{
1373	fail(regex_constants::error_escape, m_position - m_base);
1374	return false;
1375	}
1376	switch(this->m_traits.escape_syntax_type(*m_position))
1377	{
1378	case regex_constants::escape_type_control_a:
1379	result = charT('\a');
1380	break;
1381	case regex_constants::escape_type_e:
1382	result = charT(27);
1383	break;
1384	case regex_constants::escape_type_control_f:
1385	result = charT('\f');
1386	break;
1387	case regex_constants::escape_type_control_n:
1388	result = charT('\n');
1389	break;
1390	case regex_constants::escape_type_control_r:
1391	result = charT('\r');
1392	break;
1393	case regex_constants::escape_type_control_t:
1394	result = charT('\t');
1395	break;
1396	case regex_constants::escape_type_control_v:
1397	result = charT('\v');
1398	break;
1399	case regex_constants::escape_type_word_assert:
1400	result = charT('\b');
1401	break;
1402	case regex_constants::escape_type_ascii_control:
1403	++m_position;
1404	if(m_position == m_end)
1405	{
1406	fail(regex_constants::error_escape, m_position - m_base);
1407	return result;
1408	}
1409	/*
1410	if((*m_position < charT('@'))
1411	\|\| (*m_position > charT(125)) )
1412	{
1413	fail(regex_constants::error_escape, m_position - m_base);
1414	return result;
1415	}
1416	*/
1417	result = static_cast<charT>(*m_position % 32);
1418	break;
1419	case regex_constants::escape_type_hex:
1420	++m_position;
1421	if(m_position == m_end)
1422	{
1423	fail(regex_constants::error_escape, m_position - m_base);
1424	return result;
1425	}
1426	// maybe have \x{ddd}
1427	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
1428	{
1429	++m_position;
1430	if(m_position == m_end)
1431	{
1432	fail(regex_constants::error_escape, m_position - m_base);
1433	return result;
1434	}
1435	int i = this->m_traits.toi(m_position, m_end, 16);
1436	if((m_position == m_end)
1437	\|\| (i < 0)
1438	\|\| ((std::numeric_limits<charT>::is_specialized) && (charT(i) > (std::numeric_limits<charT>::max)()))
1439	\|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
1440	{
1441	fail(regex_constants::error_badbrace, m_position - m_base);
1442	return result;
1443	}
1444	++m_position;
1445	result = charT(i);
1446	}
1447	else
1448	{
1449	std::ptrdiff_t len = (std::min)(static_cast<std::ptrdiff_t>(2), m_end - m_position);
1450	int i = this->m_traits.toi(m_position, m_position + len, 16);
1451	if((i < 0)
1452	\|\| !valid_value(charT(0), i))
1453	{
1454	fail(regex_constants::error_escape, m_position - m_base);
1455	return result;
1456	}
1457	result = charT(i);
1458	}
1459	return result;
1460	case regex_constants::syntax_digit:
1461	{
1462	// an octal escape sequence, the first character must be a zero
1463	// followed by up to 3 octal digits:
1464	std::ptrdiff_t len = (std::min)(::boost::re_detail::distance(m_position, m_end), static_cast<std::ptrdiff_t>(4));
1465	const charT* bp = m_position;
1466	int val = this->m_traits.toi(bp, bp + 1, 8);
1467	if(val != 0)
1468	{
1469	// Oops not an octal escape after all:
1470	fail(regex_constants::error_escape, m_position - m_base);
1471	return result;
1472	}
1473	val = this->m_traits.toi(m_position, m_position + len, 8);
1474	if(val < 0)
1475	{
1476	fail(regex_constants::error_escape, m_position - m_base);
1477	return result;
1478	}
1479	return static_cast<charT>(val);
1480	}
1481	case regex_constants::escape_type_named_char:
1482	{
1483	++m_position;
1484	if(m_position == m_end)
1485	{
1486	fail(regex_constants::error_escape, m_position - m_base);
1487	return false;
1488	}
1489	// maybe have \N{name}
1490	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
1491	{
1492	const charT* base = m_position;
1493	// skip forward until we find enclosing brace:
1494	while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
1495	++m_position;
1496	if(m_position == m_end)
1497	{
1498	fail(regex_constants::error_escape, m_position - m_base);
1499	return false;
1500	}
1501	string_type s = this->m_traits.lookup_collatename(++base, m_position++);
1502	if(s.empty())
1503	{
1504	fail(regex_constants::error_collate, m_position - m_base);
1505	return false;
1506	}
1507	if(s.size() == 1)
1508	{
1509	return s[0];
1510	}
1511	}
1512	// fall through is a failure:
1513	fail(regex_constants::error_escape, m_position - m_base);
1514	return false;
1515	}
1516	default:
1517	result = *m_position;
1518	break;
1519	}
1520	++m_position;
1521	return result;
1522	#ifdef BOOST_MSVC
1523	#pragma warning(pop)
1524	#endif
1525	}
1526
1527	template <class charT, class traits>
1528	bool basic_regex_parser<charT, traits>::parse_backref()
1529	{
1530	BOOST_ASSERT(m_position != m_end);
1531	const charT* pc = m_position;
1532	int i = this->m_traits.toi(pc, pc + 1, 10);
1533	if((i == 0) \|\| (((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group) && (this->flags() & regbase::no_bk_refs)))
1534	{
1535	// not a backref at all but an octal escape sequence:
1536	charT c = unescape_character();
1537	this->append_literal(c);
1538	}
1539	else if((i > 0) && (this->m_backrefs & (1u << (i-1))))
1540	{
1541	m_position = pc;
1542	re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
1543	pb->index = i;
1544	}
1545	else
1546	{
1547	fail(regex_constants::error_backref, m_position - m_end);
1548	return false;
1549	}
1550	return true;
1551	}
1552
1553	template <class charT, class traits>
1554	bool basic_regex_parser<charT, traits>::parse_QE()
1555	{
1556	#ifdef BOOST_MSVC
1557	#pragma warning(push)
1558	#pragma warning(disable:4127)
1559	#endif
1560	//
1561	// parse a \Q...\E sequence:
1562	//
1563	++m_position; // skip the Q
1564	const charT* start = m_position;
1565	const charT* end;
1566	do
1567	{
1568	while((m_position != m_end)
1569	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape))
1570	++m_position;
1571	if(m_position == m_end)
1572	{
1573	// a \Q...\E sequence may terminate with the end of the expression:
1574	end = m_position;
1575	break;
1576	}
1577	if(++m_position == m_end) // skip the escape
1578	{
1579	fail(regex_constants::error_escape, m_position - m_base);
1580	return false;
1581	}
1582	// check to see if it's a \E:
1583	if(this->m_traits.escape_syntax_type(*m_position) == regex_constants::escape_type_E)
1584	{
1585	++m_position;
1586	end = m_position - 2;
1587	break;
1588	}
1589	// otherwise go round again:
1590	}while(true);
1591	//
1592	// now add all the character between the two escapes as literals:
1593	//
1594	while(start != end)
1595	{
1596	this->append_literal(*start);
1597	++start;
1598	}
1599	return true;
1600	#ifdef BOOST_MSVC
1601	#pragma warning(pop)
1602	#endif
1603	}
1604
1605	template <class charT, class traits>
1606	bool basic_regex_parser<charT, traits>::parse_perl_extension()
1607	{
1608	if(++m_position == m_end)
1609	{
1610	fail(regex_constants::error_badrepeat, m_position - m_base);
1611	return false;
1612	}
1613	//
1614	// treat comments as a special case, as these
1615	// are the only ones that don't start with a leading
1616	// startmark state:
1617	//
1618	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_hash)
1619	{
1620	while((m_position != m_end)
1621	&& (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark))
1622	{}
1623	return true;
1624	}
1625	//
1626	// backup some state, and prepare the way:
1627	//
1628	int markid = 0;
1629	std::ptrdiff_t jump_offset = 0;
1630	re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
1631	std::ptrdiff_t last_paren_start = this->getoffset(pb);
1632	// back up insertion point for alternations, and set new point:
1633	std::ptrdiff_t last_alt_point = m_alt_insert_point;
1634	this->m_pdata->m_data.align();
1635	m_alt_insert_point = this->m_pdata->m_data.size();
1636	std::ptrdiff_t expected_alt_point = m_alt_insert_point;
1637	bool restore_flags = true;
1638	regex_constants::syntax_option_type old_flags = this->flags();
1639	bool old_case_change = m_has_case_change;
1640	m_has_case_change = false;
1641	//
1642	// select the actual extension used:
1643	//
1644	switch(this->m_traits.syntax_type(*m_position))
1645	{
1646	case regex_constants::syntax_colon:
1647	//
1648	// a non-capturing mark:
1649	//
1650	pb->index = markid = 0;
1651	++m_position;
1652	break;
1653	case regex_constants::syntax_equal:
1654	pb->index = markid = -1;
1655	++m_position;
1656	jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
1657	this->m_pdata->m_data.align();
1658	m_alt_insert_point = this->m_pdata->m_data.size();
1659	break;
1660	case regex_constants::syntax_not:
1661	pb->index = markid = -2;
1662	++m_position;
1663	jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
1664	this->m_pdata->m_data.align();
1665	m_alt_insert_point = this->m_pdata->m_data.size();
1666	break;
1667	case regex_constants::escape_type_left_word:
1668	{
1669	// a lookbehind assertion:
1670	if(++m_position == m_end)
1671	{
1672	fail(regex_constants::error_badrepeat, m_position - m_base);
1673	return false;
1674	}
1675	regex_constants::syntax_type t = this->m_traits.syntax_type(*m_position);
1676	if(t == regex_constants::syntax_not)
1677	pb->index = markid = -2;
1678	else if(t == regex_constants::syntax_equal)
1679	pb->index = markid = -1;
1680	else
1681	{
1682	fail(regex_constants::error_badrepeat, m_position - m_base);
1683	return false;
1684	}
1685	++m_position;
1686	jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
1687	this->append_state(syntax_element_backstep, sizeof(re_brace));
1688	this->m_pdata->m_data.align();
1689	m_alt_insert_point = this->m_pdata->m_data.size();
1690	break;
1691	}
1692	case regex_constants::escape_type_right_word:
1693	//
1694	// an independent sub-expression:
1695	//
1696	pb->index = markid = -3;
1697	++m_position;
1698	jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
1699	this->m_pdata->m_data.align();
1700	m_alt_insert_point = this->m_pdata->m_data.size();
1701	break;
1702	case regex_constants::syntax_open_mark:
1703	{
1704	// a conditional expression:
1705	pb->index = markid = -4;
1706	if(++m_position == m_end)
1707	{
1708	fail(regex_constants::error_badrepeat, m_position - m_base);
1709	return false;
1710	}
1711	int v = this->m_traits.toi(m_position, m_end, 10);
1712	if(v > 0)
1713	{
1714	re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
1715	br->index = v;
1716	if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
1717	{
1718	fail(regex_constants::error_badrepeat, m_position - m_base);
1719	return false;
1720	}
1721	if(++m_position == m_end)
1722	{
1723	fail(regex_constants::error_badrepeat, m_position - m_base);
1724	return false;
1725	}
1726	}
1727	else
1728	{
1729	// verify that we have a lookahead or lookbehind assert:
1730	if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_question)
1731	{
1732	fail(regex_constants::error_badrepeat, m_position - m_base);
1733	return false;
1734	}
1735	if(++m_position == m_end)
1736	{
1737	fail(regex_constants::error_badrepeat, m_position - m_base);
1738	return false;
1739	}
1740	if(this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_left_word)
1741	{
1742	if(++m_position == m_end)
1743	{
1744	fail(regex_constants::error_badrepeat, m_position - m_base);
1745	return false;
1746	}
1747	if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
1748	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
1749	{
1750	fail(regex_constants::error_badrepeat, m_position - m_base);
1751	return false;
1752	}
1753	m_position -= 3;
1754	}
1755	else
1756	{
1757	if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
1758	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
1759	{
1760	fail(regex_constants::error_badrepeat, m_position - m_base);
1761	return false;
1762	}
1763	m_position -= 2;
1764	}
1765	}
1766	break;
1767	}
1768	case regex_constants::syntax_close_mark:
1769	fail(regex_constants::error_badrepeat, m_position - m_base);
1770	return false;
1771	default:
1772	//
1773	// lets assume that we have a (?imsx) group and try and parse it:
1774	//
1775	regex_constants::syntax_option_type opts = parse_options();
1776	if(m_position == m_end)
1777	return false;
1778	// make a note of whether we have a case change:
1779	m_has_case_change = ((opts & regbase::icase) != (this->flags() & regbase::icase));
1780	pb->index = markid = 0;
1781	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark)
1782	{
1783	// update flags and carry on as normal:
1784	this->flags(opts);
1785	restore_flags = false;
1786	old_case_change \|= m_has_case_change; // defer end of scope by one ')'
1787	}
1788	else if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_colon)
1789	{
1790	// update flags and carry on until the matching ')' is found:
1791	this->flags(opts);
1792	++m_position;
1793	}
1794	else
1795	{
1796	fail(regex_constants::error_badrepeat, m_position - m_base);
1797	return false;
1798	}
1799
1800	// finally append a case change state if we need it:
1801	if(m_has_case_change)
1802	{
1803	static_cast<re_case*>(
1804	this->append_state(syntax_element_toggle_case, sizeof(re_case))
1805	)->icase = opts & regbase::icase;
1806	}
1807
1808	}
1809	//
1810	// now recursively add more states, this will terminate when we get to a
1811	// matching ')' :
1812	//
1813	parse_all();
1814	//
1815	// Unwind alternatives:
1816	//
1817	if(0 == unwind_alts(last_paren_start))
1818	return false;
1819	//
1820	// we either have a ')' or we have run out of characters prematurely:
1821	//
1822	if(m_position == m_end)
1823	{
1824	this->fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_end));
1825	return false;
1826	}
1827	BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
1828	++m_position;
1829	//
1830	// restore the flags:
1831	//
1832	if(restore_flags)
1833	{
1834	// append a case change state if we need it:
1835	if(m_has_case_change)
1836	{
1837	static_cast<re_case*>(
1838	this->append_state(syntax_element_toggle_case, sizeof(re_case))
1839	)->icase = old_flags & regbase::icase;
1840	}
1841	this->flags(old_flags);
1842	}
1843	//
1844	// set up the jump pointer if we have one:
1845	//
1846	if(jump_offset)
1847	{
1848	this->m_pdata->m_data.align();
1849	re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
1850	jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
1851	if(this->m_last_state == jmp)
1852	{
1853	// Oops... we didn't have anything inside the assertion:
1854	fail(regex_constants::error_empty, m_position - m_base);
1855	return false;
1856	}
1857	}
1858	//
1859	// verify that if this is conditional expression, that we do have
1860	// an alternative, if not add one:
1861	//
1862	if(markid == -4)
1863	{
1864	re_syntax_base* b = this->getaddress(expected_alt_point);
1865	if(b->type != syntax_element_alt)
1866	{
1867	re_alt* alt = static_cast<re_alt*>(this->insert_state(expected_alt_point, syntax_element_alt, sizeof(re_alt)));
1868	alt->alt.i = this->m_pdata->m_data.size() - this->getoffset(alt);
1869	}
1870	else if(this->getaddress(static_cast<re_alt*>(b)->alt.i, b)->type == syntax_element_alt)
1871	{
1872	fail(regex_constants::error_bad_pattern, m_position - m_base);
1873	return false;
1874	}
1875	}
1876	//
1877	// append closing parenthesis state:
1878	//
1879	pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
1880	pb->index = markid;
1881	this->m_paren_start = last_paren_start;
1882	//
1883	// restore the alternate insertion point:
1884	//
1885	this->m_alt_insert_point = last_alt_point;
1886	//
1887	// and the case change data:
1888	//
1889	m_has_case_change = old_case_change;
1890	return true;
1891	}
1892
1893	template <class charT, class traits>
1894	bool basic_regex_parser<charT, traits>::add_emacs_code(bool negate)
1895	{
1896	//
1897	// parses an emacs style \sx or \Sx construct.
1898	//
1899	if(++m_position == m_end)
1900	{
1901	fail(regex_constants::error_escape, m_position - m_base);
1902	return false;
1903	}
1904	basic_char_set<charT, traits> char_set;
1905	if(negate)
1906	char_set.negate();
1907
1908	static const charT s_punct[5] = { 'p', 'u', 'n', 'c', 't', };
1909
1910	switch(*m_position)
1911	{
1912	case 's':
1913	case ' ':
1914	char_set.add_class(this->m_mask_space);
1915	break;
1916	case 'w':
1917	char_set.add_class(this->m_word_mask);
1918	break;
1919	case '_':
1920	char_set.add_single(digraph<charT>(charT('$')));
1921	char_set.add_single(digraph<charT>(charT('&')));
1922	char_set.add_single(digraph<charT>(charT('*')));
1923	char_set.add_single(digraph<charT>(charT('+')));
1924	char_set.add_single(digraph<charT>(charT('-')));
1925	char_set.add_single(digraph<charT>(charT('_')));
1926	char_set.add_single(digraph<charT>(charT('<')));
1927	char_set.add_single(digraph<charT>(charT('>')));
1928	break;
1929	case '.':
1930	char_set.add_class(this->m_traits.lookup_classname(s_punct, s_punct+5));
1931	break;
1932	case '(':
1933	char_set.add_single(digraph<charT>(charT('(')));
1934	char_set.add_single(digraph<charT>(charT('[')));
1935	char_set.add_single(digraph<charT>(charT('{')));
1936	break;
1937	case ')':
1938	char_set.add_single(digraph<charT>(charT(')')));
1939	char_set.add_single(digraph<charT>(charT(']')));
1940	char_set.add_single(digraph<charT>(charT('}')));
1941	break;
1942	case '"':
1943	char_set.add_single(digraph<charT>(charT('"')));
1944	char_set.add_single(digraph<charT>(charT('\'')));
1945	char_set.add_single(digraph<charT>(charT('`')));
1946	break;
1947	case '\'':
1948	char_set.add_single(digraph<charT>(charT('\'')));
1949	char_set.add_single(digraph<charT>(charT(',')));
1950	char_set.add_single(digraph<charT>(charT('#')));
1951	break;
1952	case '<':
1953	char_set.add_single(digraph<charT>(charT(';')));
1954	break;
1955	case '>':
1956	char_set.add_single(digraph<charT>(charT('\n')));
1957	char_set.add_single(digraph<charT>(charT('\f')));
1958	break;
1959	default:
1960	fail(regex_constants::error_ctype, m_position - m_base);
1961	return false;
1962	}
1963	if(0 == this->append_set(char_set))
1964	{
1965	fail(regex_constants::error_ctype, m_position - m_base);
1966	return false;
1967	}
1968	++m_position;
1969	return true;
1970	}
1971
1972	template <class charT, class traits>
1973	regex_constants::syntax_option_type basic_regex_parser<charT, traits>::parse_options()
1974	{
1975	// we have a (?imsx-imsx) group, convert it into a set of flags:
1976	regex_constants::syntax_option_type f = this->flags();
1977	bool breakout = false;
1978	do
1979	{
1980	switch(*m_position)
1981	{
1982	case 's':
1983	f \|= regex_constants::mod_s;
1984	f &= ~regex_constants::no_mod_s;
1985	break;
1986	case 'm':
1987	f &= ~regex_constants::no_mod_m;
1988	break;
1989	case 'i':
1990	f \|= regex_constants::icase;
1991	break;
1992	case 'x':
1993	f \|= regex_constants::mod_x;
1994	break;
1995	default:
1996	breakout = true;
1997	continue;
1998	}
1999	if(++m_position == m_end)
2000	{
2001	fail(regex_constants::error_paren, m_position - m_base);
2002	return false;
2003	}
2004	}
2005	while(!breakout);
2006
2007	if(*m_position == static_cast<charT>('-'))
2008	{
2009	if(++m_position == m_end)
2010	{
2011	fail(regex_constants::error_paren, m_position - m_base);
2012	return false;
2013	}
2014	do
2015	{
2016	switch(*m_position)
2017	{
2018	case 's':
2019	f &= ~regex_constants::mod_s;
2020	f \|= regex_constants::no_mod_s;
2021	break;
2022	case 'm':
2023	f \|= regex_constants::no_mod_m;
2024	break;
2025	case 'i':
2026	f &= ~regex_constants::icase;
2027	break;
2028	case 'x':
2029	f &= ~regex_constants::mod_x;
2030	break;
2031	default:
2032	breakout = true;
2033	continue;
2034	}
2035	if(++m_position == m_end)
2036	{
2037	fail(regex_constants::error_paren, m_position - m_base);
2038	return false;
2039	}
2040	}
2041	while(!breakout);
2042	}
2043	return f;
2044	}
2045
2046	template <class charT, class traits>
2047	bool basic_regex_parser<charT, traits>::unwind_alts(std::ptrdiff_t last_paren_start)
2048	{
2049	//
2050	// If we didn't actually add any states after the last
2051	// alternative then that's an error:
2052	//
2053	if((this->m_alt_insert_point == static_cast<std::ptrdiff_t>(this->m_pdata->m_data.size()))
2054	&& m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start))
2055	{
2056	fail(regex_constants::error_empty, this->m_position - this->m_base);
2057	return false;
2058	}
2059	//
2060	// Fix up our alternatives:
2061	//
2062	while(m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start))
2063	{
2064	//
2065	// fix up the jump to point to the end of the states
2066	// that we've just added:
2067	//
2068	std::ptrdiff_t jump_offset = m_alt_jumps.back();
2069	m_alt_jumps.pop_back();
2070	this->m_pdata->m_data.align();
2071	re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
2072	BOOST_ASSERT(jmp->type == syntax_element_jump);
2073	jmp->alt.i = this->m_pdata->m_data.size() - jump_offset;
2074	}
2075	return true;
2076	}
2077
2078	#ifdef BOOST_MSVC
2079	#pragma warning(pop)
2080	#endif
2081
2082	} // namespace re_detail
2083	} // namespace boost
2084
2085	#ifdef BOOST_HAS_ABI_HEADERS
2086	# include BOOST_ABI_SUFFIX
2087	#endif
2088
2089	#endif

Note: See TracBrowser for help on using the repository browser.

Download in other formats: