Boost logo

Boost-Commit :

From: eric_at_[hidden]
Date: 2008-01-26 14:38:46


Author: eric_niebler
Date: 2008-01-26 14:38:44 EST (Sat, 26 Jan 2008)
New Revision: 42986
URL: http://svn.boost.org/trac/boost/changeset/42986

Log:
optimize repeated searches with patterns that have leading repeats
Text files modified:
   trunk/boost/xpressive/detail/core/finder.hpp | 22 +++++++++++++
   trunk/boost/xpressive/detail/core/linker.hpp | 26 ++++++++++++++++
   trunk/boost/xpressive/detail/core/matcher/simple_repeat_matcher.hpp | 26 ++++++++++++++++
   trunk/boost/xpressive/detail/core/optimize.hpp | 9 ++++
   trunk/boost/xpressive/detail/core/peeker.hpp | 65 +++++++++++++++++++++------------------
   trunk/boost/xpressive/detail/core/state.hpp | 2 +
   trunk/boost/xpressive/detail/static/transforms/as_quantifier.hpp | 6 +-
   trunk/boost/xpressive/regex_algorithms.hpp | 4 +-
   trunk/boost/xpressive/regex_iterator.hpp | 9 +++-
   trunk/boost/xpressive/regex_token_iterator.hpp | 12 ++++---
   10 files changed, 137 insertions(+), 44 deletions(-)

Modified: trunk/boost/xpressive/detail/core/finder.hpp
==============================================================================
--- trunk/boost/xpressive/detail/core/finder.hpp (original)
+++ trunk/boost/xpressive/detail/core/finder.hpp 2008-01-26 14:38:44 EST (Sat, 26 Jan 2008)
@@ -185,6 +185,28 @@
     bool bits_[256];
 };
 
+///////////////////////////////////////////////////////////////////////////////
+// leading_simple_repeat_finder
+//
+template<typename BidiIter>
+struct leading_simple_repeat_finder
+ : finder<BidiIter>
+{
+ leading_simple_repeat_finder()
+ : finder<BidiIter>()
+ {}
+
+ bool operator ()(match_state<BidiIter> &state) const
+ {
+ state.cur_ = state.next_search_;
+ return true;
+ }
+
+private:
+ leading_simple_repeat_finder(leading_simple_repeat_finder const &);
+ leading_simple_repeat_finder &operator =(leading_simple_repeat_finder const &);
+};
+
 }}}
 
 #if defined(_MSC_VER) && (_MSC_VER >= 1020)

Modified: trunk/boost/xpressive/detail/core/linker.hpp
==============================================================================
--- trunk/boost/xpressive/detail/core/linker.hpp (original)
+++ trunk/boost/xpressive/detail/core/linker.hpp 2008-01-26 14:38:44 EST (Sat, 26 Jan 2008)
@@ -145,6 +145,7 @@
       : back_stack_()
       , traits_(&traits)
       , traits_type_(&typeid(Traits))
+ , has_backrefs_(false)
     {
     }
 
@@ -154,6 +155,24 @@
         // no-op
     }
 
+ template<typename Traits, typename ICase>
+ void accept(mark_matcher<Traits, ICase> const &, void const *)
+ {
+ this->has_backrefs_ = true;
+ }
+
+ template<typename Action>
+ void accept(action_matcher<Action> const &, void const *)
+ {
+ this->has_backrefs_ = true;
+ }
+
+ template<typename Predicate>
+ void accept(predicate_matcher<Predicate> const &, void const *)
+ {
+ this->has_backrefs_ = true;
+ }
+
     void accept(repeat_begin_matcher const &, void const *next)
     {
         this->back_stack_.push(next);
@@ -217,6 +236,12 @@
         matcher.xpr_.link(*this);
     }
 
+ // accessors
+ bool has_backrefs() const
+ {
+ return this->has_backrefs_;
+ }
+
     // for use by alt_link_pred below
     template<typename Xpr>
     void alt_branch_link(Xpr const &xpr, void const *next, xpression_peeker<Char> *peeker)
@@ -292,6 +317,7 @@
     std::stack<void const *> back_stack_;
     void const *traits_;
     std::type_info const *traits_type_;
+ bool has_backrefs_;
 };
 
 }}} // namespace boost::xpressive::detail

Modified: trunk/boost/xpressive/detail/core/matcher/simple_repeat_matcher.hpp
==============================================================================
--- trunk/boost/xpressive/detail/core/matcher/simple_repeat_matcher.hpp (original)
+++ trunk/boost/xpressive/detail/core/matcher/simple_repeat_matcher.hpp 2008-01-26 14:38:44 EST (Sat, 26 Jan 2008)
@@ -16,6 +16,7 @@
 #include <boost/assert.hpp>
 #include <boost/mpl/if.hpp>
 #include <boost/mpl/bool.hpp>
+#include <boost/next_prior.hpp>
 #include <boost/xpressive/detail/detail_fwd.hpp>
 #include <boost/xpressive/detail/core/quant_style.hpp>
 #include <boost/xpressive/detail/core/state.hpp>
@@ -65,12 +66,14 @@
         Xpr xpr_;
         unsigned int min_, max_;
         std::size_t width_;
+ mutable bool leading_;
 
         simple_repeat_matcher(Xpr const &xpr, unsigned int min, unsigned int max, std::size_t width)
           : xpr_(xpr)
           , min_(min)
           , max_(max)
           , width_(width)
+ , leading_(false)
         {
             // it is the job of the parser to make sure this never happens
             BOOST_ASSERT(min <= max);
@@ -101,6 +104,16 @@
                 ++matches;
             }
 
+ // If this repeater is at the front of the pattern, note
+ // how much of the input we consumed so that a repeated search
+ // doesn't have to cover the same ground again.
+ if(this->leading_)
+ {
+ state.next_search_ = (matches && matches < this->max_)
+ ? state.cur_
+ : (tmp == state.end_) ? tmp : boost::next(tmp);
+ }
+
             if(this->min_ > matches)
             {
                 state.cur_ = tmp;
@@ -126,6 +139,7 @@
         template<typename BidiIter, typename Next>
         bool match_(match_state<BidiIter> &state, Next const &next, non_greedy_tag) const
         {
+ BOOST_ASSERT(!this->leading_);
             BidiIter const tmp = state.cur_;
             unsigned int matches = 0;
 
@@ -161,12 +175,24 @@
             // is there enough room?
             if(this->min_ > diff_to_end)
             {
+ if(this->leading_)
+ {
+ // BUGBUG
+ state.next_search_ = boost::next(tmp);
+ }
                 return false;
             }
 
             BidiIter const min_iter = tmp + this->min_;
             state.cur_ += (std::min)((std::size_t)this->max_, diff_to_end);
 
+ if(this->leading_)
+ {
+ state.next_search_ = (diff_to_end && diff_to_end < this->max_)
+ ? state.cur_
+ : (tmp == state.end_) ? tmp : boost::next(tmp);
+ }
+
             for(;; --state.cur_)
             {
                 if(next.match(state))

Modified: trunk/boost/xpressive/detail/core/optimize.hpp
==============================================================================
--- trunk/boost/xpressive/detail/core/optimize.hpp (original)
+++ trunk/boost/xpressive/detail/core/optimize.hpp 2008-01-26 14:38:44 EST (Sat, 26 Jan 2008)
@@ -39,6 +39,13 @@
             new line_start_finder<BidiIter, Traits>(traits)
         );
     }
+ else if(peeker.leading_simple_repeat())
+ {
+ return intrusive_ptr<finder<BidiIter> >
+ (
+ new leading_simple_repeat_finder<BidiIter>()
+ );
+ }
     else if(256 != peeker.bitset().count())
     {
         return intrusive_ptr<finder<BidiIter> >
@@ -96,7 +103,7 @@
 
     // "peek" into the compiled regex to see if there are optimization opportunities
     hash_peek_bitset<char_type> bset;
- xpression_peeker<char_type> peeker(bset, traits);
+ xpression_peeker<char_type> peeker(bset, traits, linker.has_backrefs());
     regex->peek(peeker);
 
     // optimization: get the peek chars OR the boyer-moore search string

Modified: trunk/boost/xpressive/detail/core/peeker.hpp
==============================================================================
--- trunk/boost/xpressive/detail/core/peeker.hpp (original)
+++ trunk/boost/xpressive/detail/core/peeker.hpp 2008-01-26 14:38:44 EST (Sat, 26 Jan 2008)
@@ -31,28 +31,7 @@
 {
 
 ///////////////////////////////////////////////////////////////////////////////
-// peek_next
-// tell whether or not to keep looking for a peek optimization
-template<typename Matcher>
-struct peek_next
- : mpl::bool_<Matcher::width == 0>
-{
-};
-
-template<>
-struct peek_next<mark_begin_matcher>
- : mpl::true_
-{
-};
-
-template<>
-struct peek_next<repeat_begin_matcher>
- : mpl::true_
-{
-};
-
-///////////////////////////////////////////////////////////////////////////////
-// xpression_peeker
+// peeker_string
 //
 template<typename Char>
 struct peeker_string
@@ -91,12 +70,14 @@
 struct xpression_peeker
 {
     template<typename Traits>
- xpression_peeker(hash_peek_bitset<Char> &bset, Traits const &traits)
+ xpression_peeker(hash_peek_bitset<Char> &bset, Traits const &traits, bool has_backrefs = false)
       : bset_(bset)
       , str_()
       , line_start_(false)
       , traits_(0)
       , traits_type_(0)
+ , leading_simple_repeat_(0)
+ , has_backrefs_(has_backrefs)
     {
         this->set_traits(traits);
     }
@@ -113,6 +94,11 @@
         return this->line_start_;
     }
 
+ bool leading_simple_repeat() const
+ {
+ return 0 < this->leading_simple_repeat_;
+ }
+
     hash_peek_bitset<Char> const &bitset() const
     {
         return this->bset_;
@@ -120,19 +106,31 @@
 
     ///////////////////////////////////////////////////////////////////////////////
     // modifiers
- void fail(bool do_fail = true)
+ void fail()
     {
- if(do_fail)
+ this->bset_.set_all();
+ }
+
+ template<typename Matcher>
+ mpl::false_ accept(Matcher const &)
+ {
+ this->fail();
+ return mpl::false_();
+ }
+
+ mpl::true_ accept(mark_begin_matcher const &)
+ {
+ if(this->has_backrefs_)
         {
- this->bset_.set_all();
+ --this->leading_simple_repeat_;
         }
+ return mpl::true_();
     }
 
- template<typename Matcher>
- peek_next<Matcher> accept(Matcher const &)
+ mpl::true_ accept(repeat_begin_matcher const &)
     {
- this->fail(!peek_next<Matcher>::value);
- return peek_next<Matcher>();
+ --this->leading_simple_repeat_;
+ return mpl::true_();
     }
 
     template<typename Traits>
@@ -228,6 +226,11 @@
     template<typename Xpr, typename Greedy>
     mpl::false_ accept(simple_repeat_matcher<Xpr, Greedy> const &xpr)
     {
+ if(Greedy() && 1U == xpr.width_)
+ {
+ ++this->leading_simple_repeat_;
+ xpr.leading_ = this->leading_simple_repeat();
+ }
         0 != xpr.min_ ? xpr.xpr_.peek(*this) : this->fail(); // could be a union of xpr and next
         return mpl::false_();
     }
@@ -268,6 +271,8 @@
     bool line_start_;
     void const *traits_;
     std::type_info const *traits_type_;
+ int leading_simple_repeat_;
+ bool has_backrefs_;
 };
 
 }}} // namespace boost::xpressive::detail

Modified: trunk/boost/xpressive/detail/core/state.hpp
==============================================================================
--- trunk/boost/xpressive/detail/core/state.hpp (original)
+++ trunk/boost/xpressive/detail/core/state.hpp 2008-01-26 14:38:44 EST (Sat, 26 Jan 2008)
@@ -127,6 +127,7 @@
     actionable const **action_list_tail_;
     action_args_type *action_args_;
     attr_context attr_context_;
+ BidiIter next_search_;
 
     ///////////////////////////////////////////////////////////////////////////////
     //
@@ -151,6 +152,7 @@
       , action_list_tail_(&action_list_.next)
       , action_args_(&core_access<BidiIter>::get_action_args(what))
       , attr_context_() // zero-initializes the fields of attr_context_
+ , next_search_(begin)
     {
         // reclaim any cached memory in the match_results struct
         this->extras_->sub_match_stack_.unwind();

Modified: trunk/boost/xpressive/detail/static/transforms/as_quantifier.hpp
==============================================================================
--- trunk/boost/xpressive/detail/static/transforms/as_quantifier.hpp (original)
+++ trunk/boost/xpressive/detail/static/transforms/as_quantifier.hpp 2008-01-26 14:38:44 EST (Sat, 26 Jan 2008)
@@ -88,7 +88,7 @@
             typedef typename Expr::proto_tag tag;
 
             xpr_type const &xpr = Grammar()(proto::arg(expr), detail::true_xpression(), visitor);
- matcher_type matcher(xpr, min_type<tag>(), max_type<tag>(), xpr.get_width().value());
+ matcher_type matcher(xpr, (uint_t)min_type<tag>(), (uint_t)max_type<tag>(), xpr.get_width().value());
             return proto::terminal<matcher_type>::type::make(matcher);
         }
     };
@@ -174,8 +174,8 @@
             int mark_number = proto::arg(proto::left(marked_sub)).mark_number_;
             BOOST_ASSERT(0 != mark_number);
 
- unsigned min_ = min_type<typename Expr::proto_tag>();
- unsigned max_ = max_type<typename Expr::proto_tag>();
+ uint_t min_ = (uint_t)min_type<typename Expr::proto_tag>();
+ uint_t max_ = (uint_t)max_type<typename Expr::proto_tag>();
 
             detail::repeat_begin_matcher begin(mark_number);
             detail::repeat_end_matcher<Greedy> end(mark_number, min_, max_);

Modified: trunk/boost/xpressive/regex_algorithms.hpp
==============================================================================
--- trunk/boost/xpressive/regex_algorithms.hpp (original)
+++ trunk/boost/xpressive/regex_algorithms.hpp 2008-01-26 14:38:44 EST (Sat, 26 Jan 2008)
@@ -537,7 +537,7 @@
         }
 
         out = what.format(out, fmt, flags);
- cur = state.cur_ = what[0].second;
+ cur = state.cur_ = state.next_search_ = what[0].second;
 
         if(0 == (flags & format_first_only))
         {
@@ -552,7 +552,7 @@
 
                 access::set_prefix_suffix(what, begin, end);
                 out = what.format(out, fmt, flags);
- cur = state.cur_ = what[0].second;
+ cur = state.cur_ = state.next_search_ = what[0].second;
                 not_null = (0 == what.length());
                 state.reset(what, *access::get_regex_impl(re));
             }

Modified: trunk/boost/xpressive/regex_iterator.hpp
==============================================================================
--- trunk/boost/xpressive/regex_iterator.hpp (original)
+++ trunk/boost/xpressive/regex_iterator.hpp 2008-01-26 14:38:44 EST (Sat, 26 Jan 2008)
@@ -39,6 +39,7 @@
         BidiIter begin
       , BidiIter cur
       , BidiIter end
+ , BidiIter next_search
       , basic_regex<BidiIter> const *rex
       , regex_constants::match_flag_type flags
       , bool not_null = false
@@ -50,6 +51,7 @@
       , not_null_(not_null)
     {
         this->state_.cur_ = cur;
+ this->state_.next_search_ = next_search;
     }
 
     bool next()
@@ -63,7 +65,7 @@
         // Report position() correctly by setting the base different from prefix().first
         access::set_base(this->what_, this->state_.begin_);
 
- this->state_.cur_ = this->what_[0].second;
+ this->state_.cur_ = this->state_.next_search_ = this->what_[0].second;
         this->not_null_ = (0 == this->what_.length());
 
         return true;
@@ -116,7 +118,7 @@
       , basic_regex<BidiIter> const &rex
       , regex_constants::match_flag_type flags = regex_constants::match_default
     )
- : impl_(new impl_type_(begin, begin, end, &rex, flags))
+ : impl_(new impl_type_(begin, begin, end, begin, &rex, flags))
     {
         this->next_();
     }
@@ -130,7 +132,7 @@
       , detail::let_<LetExpr> const &args
       , regex_constants::match_flag_type flags = regex_constants::match_default
     )
- : impl_(new impl_type_(begin, begin, end, &rex, flags))
+ : impl_(new impl_type_(begin, begin, end, begin, &rex, flags))
     {
         detail::bind_args(args, this->impl_->what_);
         this->next_();
@@ -222,6 +224,7 @@
                 that->state_.begin_
               , that->state_.cur_
               , that->state_.end_
+ , that->state_.next_search_
               , that->rex_
               , that->flags_
               , that->not_null_

Modified: trunk/boost/xpressive/regex_token_iterator.hpp
==============================================================================
--- trunk/boost/xpressive/regex_token_iterator.hpp (original)
+++ trunk/boost/xpressive/regex_token_iterator.hpp 2008-01-26 14:38:44 EST (Sat, 26 Jan 2008)
@@ -39,13 +39,14 @@
         BidiIter begin
       , BidiIter cur
       , BidiIter end
+ , BidiIter next_search
       , basic_regex<BidiIter> const *rex
       , regex_constants::match_flag_type flags = regex_constants::match_default
       , std::vector<int> subs = std::vector<int>(1, 0)
       , int n = -2
       , bool not_null = false
     )
- : iter_(begin, cur, end, rex, flags, not_null)
+ : iter_(begin, cur, end, next_search, rex, flags, not_null)
       , result_()
       , n_((-2 == n) ? (int)subs.size() - 1 : n)
       , subs_()
@@ -158,7 +159,7 @@
       , BidiIter end
       , basic_regex<BidiIter> const &rex
     )
- : impl_(new impl_type_(begin, begin, end, &rex))
+ : impl_(new impl_type_(begin, begin, end, begin, &rex))
     {
         this->next_();
     }
@@ -176,7 +177,7 @@
       , basic_regex<BidiIter> const &rex
       , detail::let_<LetExpr> const &args
     )
- : impl_(new impl_type_(begin, begin, end, &rex))
+ : impl_(new impl_type_(begin, begin, end, begin, &rex))
     {
         detail::bind_args(args, this->impl_->iter_.what_);
         this->next_();
@@ -198,7 +199,7 @@
       , Subs const &subs
       , regex_constants::match_flag_type flags = regex_constants::match_default
     )
- : impl_(new impl_type_(begin, begin, end, &rex, flags, detail::to_vector(subs)))
+ : impl_(new impl_type_(begin, begin, end, begin, &rex, flags, detail::to_vector(subs)))
     {
         this->next_();
     }
@@ -221,7 +222,7 @@
       , detail::let_<LetExpr> const &args
       , regex_constants::match_flag_type flags = regex_constants::match_default
     )
- : impl_(new impl_type_(begin, begin, end, &rex, flags, detail::to_vector(subs)))
+ : impl_(new impl_type_(begin, begin, end, begin, &rex, flags, detail::to_vector(subs)))
     {
         detail::bind_args(args, this->impl_->iter_.what_);
         this->next_();
@@ -307,6 +308,7 @@
                 this->impl_->iter_.state_.begin_
               , this->impl_->iter_.state_.cur_
               , this->impl_->iter_.state_.end_
+ , this->impl_->iter_.state_.next_search_
               , this->impl_->iter_.rex_
               , this->impl_->iter_.flags_
               , this->impl_->subs_


Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk