1 | /* |
---|
2 | * |
---|
3 | * Copyright (c) 2002 |
---|
4 | * John Maddock |
---|
5 | * |
---|
6 | * Use, modification and distribution are subject to the |
---|
7 | * Boost Software License, Version 1.0. (See accompanying file |
---|
8 | * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
---|
9 | * |
---|
10 | */ |
---|
11 | |
---|
12 | #ifndef BOOST_REGEX_MATCHER_HPP |
---|
13 | #define BOOST_REGEX_MATCHER_HPP |
---|
14 | |
---|
15 | #include <boost/regex/v4/iterator_category.hpp> |
---|
16 | |
---|
17 | #ifdef BOOST_HAS_ABI_HEADERS |
---|
18 | # include BOOST_ABI_PREFIX |
---|
19 | #endif |
---|
20 | |
---|
21 | namespace boost{ |
---|
22 | namespace re_detail{ |
---|
23 | |
---|
24 | // |
---|
25 | // error checking API: |
---|
26 | // |
---|
27 | BOOST_REGEX_DECL void BOOST_REGEX_CALL verify_options(boost::regex_constants::syntax_option_type ef, match_flag_type mf); |
---|
28 | // |
---|
29 | // function can_start: |
---|
30 | // |
---|
31 | template <class charT> |
---|
32 | bool can_start(charT c, const unsigned char* map, unsigned char mask) |
---|
33 | { |
---|
34 | return ((c < static_cast<charT>(0)) ? true : ((c >= static_cast<charT>(1 << CHAR_BIT)) ? true : map[c] & mask)); |
---|
35 | } |
---|
36 | inline bool can_start(char c, const unsigned char* map, unsigned char mask) |
---|
37 | { |
---|
38 | return map[(unsigned char)c] & mask; |
---|
39 | } |
---|
40 | inline bool can_start(signed char c, const unsigned char* map, unsigned char mask) |
---|
41 | { |
---|
42 | return map[(unsigned char)c] & mask; |
---|
43 | } |
---|
44 | inline bool can_start(unsigned char c, const unsigned char* map, unsigned char mask) |
---|
45 | { |
---|
46 | return map[c] & mask; |
---|
47 | } |
---|
48 | inline bool can_start(unsigned short c, const unsigned char* map, unsigned char mask) |
---|
49 | { |
---|
50 | return ((c >= (1 << CHAR_BIT)) ? true : map[c] & mask); |
---|
51 | } |
---|
52 | #if !defined(__HP_aCC) |
---|
53 | #if defined(WCHAR_MIN) && (WCHAR_MIN == 0) && !defined(BOOST_NO_INTRINSIC_WCHAR_T) |
---|
54 | inline bool can_start(wchar_t c, const unsigned char* map, unsigned char mask) |
---|
55 | { |
---|
56 | return ((c >= (1 << CHAR_BIT)) ? true : map[c] & mask); |
---|
57 | } |
---|
58 | #endif |
---|
59 | #endif |
---|
60 | |
---|
61 | |
---|
62 | // |
---|
63 | // Unfortunately Rogue Waves standard library appears to have a bug |
---|
64 | // in std::basic_string::compare that results in eroneous answers |
---|
65 | // in some cases (tested with Borland C++ 5.1, Rogue Wave lib version |
---|
66 | // 0x020101) the test case was: |
---|
67 | // {39135,0} < {0xff,0} |
---|
68 | // which succeeds when it should not. |
---|
69 | // |
---|
70 | #ifndef _RWSTD_VER |
---|
71 | #if !BOOST_WORKAROUND(BOOST_MSVC, < 1310) |
---|
72 | template <class C, class T, class A> |
---|
73 | inline int string_compare(const std::basic_string<C,T,A>& s, const C* p) |
---|
74 | { |
---|
75 | if(0 == *p) |
---|
76 | { |
---|
77 | if(s.empty() || ((s.size() == 1) && (s[0] == 0))) |
---|
78 | return 0; |
---|
79 | } |
---|
80 | return s.compare(p); |
---|
81 | } |
---|
82 | #endif |
---|
83 | #else |
---|
84 | #if !BOOST_WORKAROUND(BOOST_MSVC, < 1310) |
---|
85 | template <class C, class T, class A> |
---|
86 | inline int string_compare(const std::basic_string<C,T,A>& s, const C* p) |
---|
87 | { |
---|
88 | if(0 == *p) |
---|
89 | { |
---|
90 | if(s.empty() || ((s.size() == 1) && (s[0] == 0))) |
---|
91 | return 0; |
---|
92 | } |
---|
93 | return s.compare(p); |
---|
94 | } |
---|
95 | #endif |
---|
96 | inline int string_compare(const std::string& s, const char* p) |
---|
97 | { return std::strcmp(s.c_str(), p); } |
---|
98 | # ifndef BOOST_NO_WREGEX |
---|
99 | inline int string_compare(const std::wstring& s, const wchar_t* p) |
---|
100 | { return std::wcscmp(s.c_str(), p); } |
---|
101 | #endif |
---|
102 | #endif |
---|
103 | template <class Seq, class C> |
---|
104 | inline int string_compare(const Seq& s, const C* p) |
---|
105 | { |
---|
106 | std::size_t i = 0; |
---|
107 | while((i < s.size()) && (p[i] == s[i])) |
---|
108 | { |
---|
109 | ++i; |
---|
110 | } |
---|
111 | return (i == s.size()) ? -p[i] : s[i] - p[i]; |
---|
112 | } |
---|
113 | # define STR_COMP(s,p) string_compare(s,p) |
---|
114 | |
---|
115 | template<class charT> |
---|
116 | inline const charT* re_skip_past_null(const charT* p) |
---|
117 | { |
---|
118 | while (*p != static_cast<charT>(0)) ++p; |
---|
119 | return ++p; |
---|
120 | } |
---|
121 | |
---|
122 | template <class iterator, class charT, class traits_type, class char_classT> |
---|
123 | iterator BOOST_REGEX_CALL re_is_set_member(iterator next, |
---|
124 | iterator last, |
---|
125 | const re_set_long<char_classT>* set_, |
---|
126 | const regex_data<charT, traits_type>& e, bool icase) |
---|
127 | { |
---|
128 | const charT* p = reinterpret_cast<const charT*>(set_+1); |
---|
129 | iterator ptr; |
---|
130 | unsigned int i; |
---|
131 | //bool icase = e.m_flags & regex_constants::icase; |
---|
132 | |
---|
133 | if(next == last) return next; |
---|
134 | |
---|
135 | typedef typename traits_type::string_type traits_string_type; |
---|
136 | const ::boost::regex_traits_wrapper<traits_type>& traits_inst = *(e.m_ptraits); |
---|
137 | |
---|
138 | // dwa 9/13/00 suppress incorrect MSVC warning - it claims this is never |
---|
139 | // referenced |
---|
140 | (void)traits_inst; |
---|
141 | |
---|
142 | // try and match a single character, could be a multi-character |
---|
143 | // collating element... |
---|
144 | for(i = 0; i < set_->csingles; ++i) |
---|
145 | { |
---|
146 | ptr = next; |
---|
147 | if(*p == static_cast<charT>(0)) |
---|
148 | { |
---|
149 | // treat null string as special case: |
---|
150 | if(traits_inst.translate(*ptr, icase) != *p) |
---|
151 | { |
---|
152 | while(*p == static_cast<charT>(0))++p; |
---|
153 | continue; |
---|
154 | } |
---|
155 | return set_->isnot ? next : (ptr == next) ? ++next : ptr; |
---|
156 | } |
---|
157 | else |
---|
158 | { |
---|
159 | while(*p && (ptr != last)) |
---|
160 | { |
---|
161 | if(traits_inst.translate(*ptr, icase) != *p) |
---|
162 | break; |
---|
163 | ++p; |
---|
164 | ++ptr; |
---|
165 | } |
---|
166 | |
---|
167 | if(*p == static_cast<charT>(0)) // if null we've matched |
---|
168 | return set_->isnot ? next : (ptr == next) ? ++next : ptr; |
---|
169 | |
---|
170 | p = re_skip_past_null(p); // skip null |
---|
171 | } |
---|
172 | } |
---|
173 | |
---|
174 | charT col = traits_inst.translate(*next, icase); |
---|
175 | |
---|
176 | |
---|
177 | if(set_->cranges || set_->cequivalents) |
---|
178 | { |
---|
179 | traits_string_type s1; |
---|
180 | // |
---|
181 | // try and match a range, NB only a single character can match |
---|
182 | if(set_->cranges) |
---|
183 | { |
---|
184 | if((e.m_flags & regex_constants::collate) == 0) |
---|
185 | s1.assign(1, col); |
---|
186 | else |
---|
187 | { |
---|
188 | charT a[2] = { col, charT(0), }; |
---|
189 | s1 = traits_inst.transform(a, a + 1); |
---|
190 | } |
---|
191 | for(i = 0; i < set_->cranges; ++i) |
---|
192 | { |
---|
193 | if(STR_COMP(s1, p) >= 0) |
---|
194 | { |
---|
195 | do{ ++p; }while(*p); |
---|
196 | ++p; |
---|
197 | if(STR_COMP(s1, p) <= 0) |
---|
198 | return set_->isnot ? next : ++next; |
---|
199 | } |
---|
200 | else |
---|
201 | { |
---|
202 | // skip first string |
---|
203 | do{ ++p; }while(*p); |
---|
204 | ++p; |
---|
205 | } |
---|
206 | // skip second string |
---|
207 | do{ ++p; }while(*p); |
---|
208 | ++p; |
---|
209 | } |
---|
210 | } |
---|
211 | // |
---|
212 | // try and match an equivalence class, NB only a single character can match |
---|
213 | if(set_->cequivalents) |
---|
214 | { |
---|
215 | charT a[2] = { col, charT(0), }; |
---|
216 | s1 = traits_inst.transform_primary(a, a +1); |
---|
217 | for(i = 0; i < set_->cequivalents; ++i) |
---|
218 | { |
---|
219 | if(STR_COMP(s1, p) == 0) |
---|
220 | return set_->isnot ? next : ++next; |
---|
221 | // skip string |
---|
222 | do{ ++p; }while(*p); |
---|
223 | ++p; |
---|
224 | } |
---|
225 | } |
---|
226 | } |
---|
227 | if(traits_inst.isctype(col, set_->cclasses) == true) |
---|
228 | return set_->isnot ? next : ++next; |
---|
229 | if((set_->cnclasses != 0) && (traits_inst.isctype(col, set_->cnclasses) == false)) |
---|
230 | return set_->isnot ? next : ++next; |
---|
231 | return set_->isnot ? ++next : next; |
---|
232 | } |
---|
233 | |
---|
234 | template <class BidiIterator> |
---|
235 | class repeater_count |
---|
236 | { |
---|
237 | repeater_count** stack; |
---|
238 | repeater_count* next; |
---|
239 | int id; |
---|
240 | std::size_t count; // the number of iterations so far |
---|
241 | BidiIterator start_pos; // where the last repeat started |
---|
242 | public: |
---|
243 | repeater_count(repeater_count** s) |
---|
244 | { |
---|
245 | stack = s; |
---|
246 | next = 0; |
---|
247 | id = -1; |
---|
248 | count = 0; |
---|
249 | } |
---|
250 | repeater_count(int i, repeater_count** s, BidiIterator start) |
---|
251 | : start_pos(start) |
---|
252 | { |
---|
253 | id = i; |
---|
254 | stack = s; |
---|
255 | next = *stack; |
---|
256 | *stack = this; |
---|
257 | if(id > next->id) |
---|
258 | count = 0; |
---|
259 | else |
---|
260 | { |
---|
261 | repeater_count* p = next; |
---|
262 | while(p->id != id) |
---|
263 | p = p->next; |
---|
264 | count = p->count; |
---|
265 | start_pos = p->start_pos; |
---|
266 | } |
---|
267 | } |
---|
268 | ~repeater_count() |
---|
269 | { |
---|
270 | *stack = next; |
---|
271 | } |
---|
272 | std::size_t get_count() { return count; } |
---|
273 | int get_id() { return id; } |
---|
274 | std::size_t operator++() { return ++count; } |
---|
275 | bool check_null_repeat(const BidiIterator& pos, std::size_t max) |
---|
276 | { |
---|
277 | // this is called when we are about to start a new repeat, |
---|
278 | // if the last one was NULL move our count to max, |
---|
279 | // otherwise save the current position. |
---|
280 | bool result = (count == 0) ? false : (pos == start_pos); |
---|
281 | if(result) |
---|
282 | count = max; |
---|
283 | else |
---|
284 | start_pos = pos; |
---|
285 | return result; |
---|
286 | } |
---|
287 | }; |
---|
288 | |
---|
289 | struct saved_state; |
---|
290 | |
---|
291 | enum saved_state_type |
---|
292 | { |
---|
293 | saved_type_end = 0, |
---|
294 | saved_type_paren = 1, |
---|
295 | saved_type_recurse = 2, |
---|
296 | saved_type_assertion = 3, |
---|
297 | saved_state_alt = 4, |
---|
298 | saved_state_repeater_count = 5, |
---|
299 | saved_state_extra_block = 6, |
---|
300 | saved_state_greedy_single_repeat = 7, |
---|
301 | saved_state_rep_slow_dot = 8, |
---|
302 | saved_state_rep_fast_dot = 9, |
---|
303 | saved_state_rep_char = 10, |
---|
304 | saved_state_rep_short_set = 11, |
---|
305 | saved_state_rep_long_set = 12, |
---|
306 | saved_state_non_greedy_long_repeat = 13, |
---|
307 | saved_state_count = 14 |
---|
308 | }; |
---|
309 | |
---|
310 | #ifdef BOOST_MSVC |
---|
311 | #pragma warning(push) |
---|
312 | #pragma warning(disable : 4251 4231 4660) |
---|
313 | #endif |
---|
314 | |
---|
315 | template <class BidiIterator, class Allocator, class traits> |
---|
316 | class perl_matcher |
---|
317 | { |
---|
318 | public: |
---|
319 | typedef typename traits::char_type char_type; |
---|
320 | typedef perl_matcher<BidiIterator, Allocator, traits> self_type; |
---|
321 | typedef bool (self_type::*matcher_proc_type)(void); |
---|
322 | typedef typename traits::size_type traits_size_type; |
---|
323 | typedef typename is_byte<char_type>::width_type width_type; |
---|
324 | typedef typename regex_iterator_traits<BidiIterator>::difference_type difference_type; |
---|
325 | |
---|
326 | perl_matcher(BidiIterator first, BidiIterator end, |
---|
327 | match_results<BidiIterator, Allocator>& what, |
---|
328 | const basic_regex<char_type, traits>& e, |
---|
329 | match_flag_type f, |
---|
330 | BidiIterator base); |
---|
331 | |
---|
332 | bool match(); |
---|
333 | bool find(); |
---|
334 | |
---|
335 | void setf(match_flag_type f) |
---|
336 | { m_match_flags |= f; } |
---|
337 | void unsetf(match_flag_type f) |
---|
338 | { m_match_flags &= ~f; } |
---|
339 | |
---|
340 | private: |
---|
341 | void construct_init(const basic_regex<char_type, traits>& e, match_flag_type f); |
---|
342 | bool find_imp(); |
---|
343 | bool match_imp(); |
---|
344 | #ifdef BOOST_REGEX_HAS_MS_STACK_GUARD |
---|
345 | typedef bool (perl_matcher::*protected_proc_type)(); |
---|
346 | bool protected_call(protected_proc_type); |
---|
347 | #endif |
---|
348 | void estimate_max_state_count(std::random_access_iterator_tag*); |
---|
349 | void estimate_max_state_count(void*); |
---|
350 | bool match_prefix(); |
---|
351 | bool match_all_states(); |
---|
352 | |
---|
353 | // match procs, stored in s_match_vtable: |
---|
354 | bool match_startmark(); |
---|
355 | bool match_endmark(); |
---|
356 | bool match_literal(); |
---|
357 | bool match_start_line(); |
---|
358 | bool match_end_line(); |
---|
359 | bool match_wild(); |
---|
360 | bool match_match(); |
---|
361 | bool match_word_boundary(); |
---|
362 | bool match_within_word(); |
---|
363 | bool match_word_start(); |
---|
364 | bool match_word_end(); |
---|
365 | bool match_buffer_start(); |
---|
366 | bool match_buffer_end(); |
---|
367 | bool match_backref(); |
---|
368 | bool match_long_set(); |
---|
369 | bool match_set(); |
---|
370 | bool match_jump(); |
---|
371 | bool match_alt(); |
---|
372 | bool match_rep(); |
---|
373 | bool match_combining(); |
---|
374 | bool match_soft_buffer_end(); |
---|
375 | bool match_restart_continue(); |
---|
376 | bool match_long_set_repeat(); |
---|
377 | bool match_set_repeat(); |
---|
378 | bool match_char_repeat(); |
---|
379 | bool match_dot_repeat_fast(); |
---|
380 | bool match_dot_repeat_slow(); |
---|
381 | bool match_backstep(); |
---|
382 | bool match_assert_backref(); |
---|
383 | bool match_toggle_case(); |
---|
384 | #ifdef BOOST_REGEX_RECURSIVE |
---|
385 | bool backtrack_till_match(std::size_t count); |
---|
386 | #endif |
---|
387 | |
---|
388 | // find procs stored in s_find_vtable: |
---|
389 | bool find_restart_any(); |
---|
390 | bool find_restart_word(); |
---|
391 | bool find_restart_line(); |
---|
392 | bool find_restart_buf(); |
---|
393 | bool find_restart_lit(); |
---|
394 | |
---|
395 | private: |
---|
396 | // final result structure to be filled in: |
---|
397 | match_results<BidiIterator, Allocator>& m_result; |
---|
398 | // temporary result for POSIX matches: |
---|
399 | scoped_ptr<match_results<BidiIterator, Allocator> > m_temp_match; |
---|
400 | // pointer to actual result structure to fill in: |
---|
401 | match_results<BidiIterator, Allocator>* m_presult; |
---|
402 | // start of sequence being searched: |
---|
403 | BidiIterator base; |
---|
404 | // end of sequence being searched: |
---|
405 | BidiIterator last; |
---|
406 | // current character being examined: |
---|
407 | BidiIterator position; |
---|
408 | // where to restart next search after failed match attempt: |
---|
409 | BidiIterator restart; |
---|
410 | // where the current search started from, acts as base for $` during grep: |
---|
411 | BidiIterator search_base; |
---|
412 | // how far we can go back when matching lookbehind: |
---|
413 | BidiIterator backstop; |
---|
414 | // the expression being examined: |
---|
415 | const basic_regex<char_type, traits>& re; |
---|
416 | // the expression's traits class: |
---|
417 | const ::boost::regex_traits_wrapper<traits>& traits_inst; |
---|
418 | // the next state in the machine being matched: |
---|
419 | const re_syntax_base* pstate; |
---|
420 | // matching flags in use: |
---|
421 | match_flag_type m_match_flags; |
---|
422 | // how many states we have examined so far: |
---|
423 | difference_type state_count; |
---|
424 | // max number of states to examine before giving up: |
---|
425 | difference_type max_state_count; |
---|
426 | // whether we should ignore case or not: |
---|
427 | bool icase; |
---|
428 | // set to true when (position == last), indicates that we may have a partial match: |
---|
429 | bool m_has_partial_match; |
---|
430 | // set to true whenever we get a match: |
---|
431 | bool m_has_found_match; |
---|
432 | // set to true whenever we're inside an independent sub-expression: |
---|
433 | bool m_independent; |
---|
434 | // the current repeat being examined: |
---|
435 | repeater_count<BidiIterator>* next_count; |
---|
436 | // the first repeat being examined (top of linked list): |
---|
437 | repeater_count<BidiIterator> rep_obj; |
---|
438 | // the mask to pass when matching word boundaries: |
---|
439 | typename traits::char_class_type m_word_mask; |
---|
440 | // the bitmask to use when determining whether a match_any matches a newline or not: |
---|
441 | unsigned char match_any_mask; |
---|
442 | |
---|
443 | #ifdef BOOST_REGEX_NON_RECURSIVE |
---|
444 | // |
---|
445 | // additional members for non-recursive version: |
---|
446 | // |
---|
447 | typedef bool (self_type::*unwind_proc_type)(bool); |
---|
448 | |
---|
449 | void extend_stack(); |
---|
450 | bool unwind(bool); |
---|
451 | bool unwind_end(bool); |
---|
452 | bool unwind_paren(bool); |
---|
453 | bool unwind_recursion_stopper(bool); |
---|
454 | bool unwind_assertion(bool); |
---|
455 | bool unwind_alt(bool); |
---|
456 | bool unwind_repeater_counter(bool); |
---|
457 | bool unwind_extra_block(bool); |
---|
458 | bool unwind_greedy_single_repeat(bool); |
---|
459 | bool unwind_slow_dot_repeat(bool); |
---|
460 | bool unwind_fast_dot_repeat(bool); |
---|
461 | bool unwind_char_repeat(bool); |
---|
462 | bool unwind_short_set_repeat(bool); |
---|
463 | bool unwind_long_set_repeat(bool); |
---|
464 | bool unwind_non_greedy_repeat(bool); |
---|
465 | void destroy_single_repeat(); |
---|
466 | void push_matched_paren(int index, const sub_match<BidiIterator>& sub); |
---|
467 | void push_recursion_stopper(); |
---|
468 | void push_assertion(const re_syntax_base* ps, bool positive); |
---|
469 | void push_alt(const re_syntax_base* ps); |
---|
470 | void push_repeater_count(int i, repeater_count<BidiIterator>** s); |
---|
471 | void push_single_repeat(std::size_t c, const re_repeat* r, BidiIterator last_position, int id); |
---|
472 | void push_non_greedy_repeat(const re_syntax_base* ps); |
---|
473 | |
---|
474 | |
---|
475 | // pointer to base of stack: |
---|
476 | saved_state* m_stack_base; |
---|
477 | // pointer to current stack position: |
---|
478 | saved_state* m_backup_state; |
---|
479 | // determines what value to return when unwinding from recursion, |
---|
480 | // allows for mixed recursive/non-recursive algorithm: |
---|
481 | bool m_recursive_result; |
---|
482 | // how many memory blocks have we used up?: |
---|
483 | unsigned used_block_count; |
---|
484 | #endif |
---|
485 | |
---|
486 | // these operations aren't allowed, so are declared private, |
---|
487 | // bodies are provided to keep explicit-instantiation requests happy: |
---|
488 | perl_matcher& operator=(const perl_matcher&) |
---|
489 | { |
---|
490 | return *this; |
---|
491 | } |
---|
492 | perl_matcher(const perl_matcher& that) |
---|
493 | : m_result(that.m_result), re(that.re), traits_inst(that.traits_inst), rep_obj(0) {} |
---|
494 | }; |
---|
495 | |
---|
496 | #ifdef BOOST_MSVC |
---|
497 | #pragma warning(pop) |
---|
498 | #endif |
---|
499 | |
---|
500 | } // namespace re_detail |
---|
501 | |
---|
502 | #ifdef BOOST_HAS_ABI_HEADERS |
---|
503 | # include BOOST_ABI_SUFFIX |
---|
504 | #endif |
---|
505 | |
---|
506 | } // namespace boost |
---|
507 | |
---|
508 | // |
---|
509 | // include the implementation of perl_matcher: |
---|
510 | // |
---|
511 | #ifdef BOOST_REGEX_RECURSIVE |
---|
512 | #include <boost/regex/v4/perl_matcher_recursive.hpp> |
---|
513 | #else |
---|
514 | #include <boost/regex/v4/perl_matcher_non_recursive.hpp> |
---|
515 | #endif |
---|
516 | // this one has to be last: |
---|
517 | #include <boost/regex/v4/perl_matcher_common.hpp> |
---|
518 | |
---|
519 | #endif |
---|
520 | |
---|