|
Boost-Commit : |
Subject: [Boost-commit] svn:boost r64003 - in sandbox/SOC/2010/stringalgos: boost/algorithm/string boost/algorithm/string/detail boost/algorithm/string/string_search boost/algorithm/string/string_search/detail libs/algorithm/string/test
From: mstefanro_at_[hidden]
Date: 2010-07-13 21:24:21
Author: mstefanro
Date: 2010-07-13 21:24:19 EDT (Tue, 13 Jul 2010)
New Revision: 64003
URL: http://svn.boost.org/trac/boost/changeset/64003
Log:
[GSoC2010][StringAlgo] Boyer-Moore and Suffix Array Search (improvement required on both, latter needs different algorithm for more efficiency); some bugfixes.
Added:
sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search.hpp (contents, props changed)
Text files modified:
sandbox/SOC/2010/stringalgos/boost/algorithm/string/detail/finder.hpp | 46 +++++++
sandbox/SOC/2010/stringalgos/boost/algorithm/string/finder.hpp | 243 ++++++++++++++++++---------------------
sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/boyer_moore.hpp | 122 +++++++++++++++++++
sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/detail/rabin_karp.hpp | 2
sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/knuth_morris_pratt.hpp | 6
sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/naive_search.hpp | 97 +++++++--------
sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/rabin_karp.hpp | 53 +-------
sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/suffix_array.hpp | 199 ++++++++++++++++++++++++++++++++
sandbox/SOC/2010/stringalgos/libs/algorithm/string/test/finder_test.cpp | 23 +++
9 files changed, 552 insertions(+), 239 deletions(-)
Modified: sandbox/SOC/2010/stringalgos/boost/algorithm/string/detail/finder.hpp
==============================================================================
--- sandbox/SOC/2010/stringalgos/boost/algorithm/string/detail/finder.hpp (original)
+++ sandbox/SOC/2010/stringalgos/boost/algorithm/string/detail/finder.hpp 2010-07-13 21:24:19 EDT (Tue, 13 Jul 2010)
@@ -21,9 +21,49 @@
#include <boost/range/empty.hpp>
#include <boost/range/as_literal.hpp>
-namespace boost {
- namespace algorithm {
- namespace detail {
+#include <boost/iterator/iterator_traits.hpp>
+
+#include <boost/mpl/and.hpp>
+#include <boost/type_traits/remove_cv.hpp>
+
+namespace boost { namespace algorithm { namespace detail {
+ template <class T, class U> struct is_pointer_to :
+ boost::mpl::and_<
+ typename boost::is_pointer<T>,
+ typename boost::is_same<
+ typename boost::remove_cv<typename boost::remove_pointer<T>::type>::type,
+ U>
+ > {};
+ template <class Range1T, class Range2T, class ComparatorT, class AllocatorT>
+ struct finder_typedefs
+ {
+ //! The type of the substring
+ typedef Range1T substring_type;
+ //! The type of the string
+ typedef Range2T string_type;
+ //! The type of the comparator
+ typedef ComparatorT comparator_type;
+ //! The type of the allocator
+ typedef AllocatorT allocator_type;
+ //! The type of the substring's iterator
+ typedef typename boost::range_const_iterator<substring_type>::type
+ substring_iterator_type;
+ //! The type of the string's iterator
+ typedef typename boost::range_const_iterator<string_type>::type
+ string_iterator_type;
+ //! The character type of the substring
+ typedef typename boost::iterator_value<substring_iterator_type>::type
+ substring_char_type;
+ //! The character type of the string
+ typedef typename boost::iterator_value<string_iterator_type>::type
+ string_char_type;
+ typedef typename boost::iterator_range<substring_iterator_type>
+ substring_range_type;
+ typedef typename boost::iterator_range<string_iterator_type>
+ string_range_type;
+ typedef typename boost::iterator_difference<string_iterator_type>::type
+ string_difference_type;
+ };
// find first functor -----------------------------------------------//
Modified: sandbox/SOC/2010/stringalgos/boost/algorithm/string/finder.hpp
==============================================================================
--- sandbox/SOC/2010/stringalgos/boost/algorithm/string/finder.hpp (original)
+++ sandbox/SOC/2010/stringalgos/boost/algorithm/string/finder.hpp 2010-07-13 21:24:19 EDT (Tue, 13 Jul 2010)
@@ -45,66 +45,116 @@
for finders provided in this library.
*/
-//! \todo Move this to an appropriate header in detail/
-namespace boost { namespace detail {
- template <class T, class U> struct is_pointer_to :
- boost::mpl::and_<
- typename boost::is_pointer<T>,
- typename boost::is_same<
- typename boost::remove_cv<typename boost::remove_pointer<T>::type>::type,
- U>
- > {};
- template <class Range1T, class Range2T, class ComparatorT, class AllocatorT>
- struct finder_typedefs
- {
- //! The type of the substring
- typedef Range1T substring_type;
- //! The type of the string
- typedef Range2T string_type;
- //! The type of the comparator
- typedef ComparatorT comparator_type;
- //! The type of the allocator
- typedef AllocatorT allocator_type;
- //! The type of the substring's iterator
- typedef typename boost::range_const_iterator<substring_type>::type
- substring_iterator_type;
- //! The type of the string's iterator
- typedef typename boost::range_const_iterator<string_type>::type
- string_iterator_type;
- //! The character type of the substring
- typedef typename boost::iterator_value<substring_iterator_type>::type
- substring_char_type;
- //! The character type of the string
- typedef typename boost::iterator_value<string_iterator_type>::type
- string_char_type;
- typedef typename boost::iterator_range<substring_iterator_type>
- substring_range_type;
- typedef typename boost::iterator_range<string_iterator_type>
- string_range_type;
- typedef typename boost::iterator_difference<string_iterator_type>::type
- string_difference_type;
- };
-} // namespace detail
-} // namespace boost
-
namespace boost { namespace algorithm {
// Finder types ---------------------------------------------//
- template <class,class,class,class> struct finder_no_additional_behavior;
- template <class,class,class,class> class finder_profiling_behavior;
- template <class,class,class,class> class finder_functor_first_finder;
- template <class,class,class,class> class finder_functor_last_finder;
+ template <class,class,class,class,class,class> struct finder_no_additional_behavior;
+ template <class,class,class,class,class,class> class finder_profiling_behavior;
+ template <class,class,class,class,class,class> class finder_functor_first_finder;
+ template <class,class,class,class,class,class> class finder_functor_last_finder;
+ template <class,class,class,class,class,class> class finder_functor_nth_finder;
+
+
+ template <class Range1T, class Range2T, class AlgorithmT,
+ class ComparatorT = ::boost::algorithm_is_equal,
+ class AllocatorT = typename AlgorithmT::default_allocator_type,
+ template <class,class,class,class,class,class> class AdditionalBehaviorT =
+ boost::algorithm::finder_no_additional_behavior
+ >
+ class simplified_finder_t :
+ public boost::algorithm::detail::finder_typedefs<Range1T,Range2T,ComparatorT,AllocatorT>,
+ private AlgorithmT::algorithm<
+ simplified_finder_t<Range1T, Range2T, AlgorithmT, ComparatorT, AllocatorT, AdditionalBehaviorT>,
+ typename boost::range_const_iterator<Range1T>::type,
+ typename boost::range_const_iterator<Range2T>::type,
+ ComparatorT,AllocatorT>
+ {
+ //! \todo Add concept assertions here.
+
+ public:
+ simplified_finder_t(Range1T const &substr, Range2T const &str,
+ ComparatorT comparator = ComparatorT(), AllocatorT allocator = AllocatorT())
+ : substring_range_(substr), string_range_(str),
+ comparator_(comparator), allocator_(allocator),
+ substring_has_changed_(true), string_has_changed_(true),
+ algorithm()
+ { }
+
+ void set_substring (Range1T const &substr)
+ { substring_range_ = substr; substring_has_changed_ = true; }
+
+ void set_string (Range2T const &str)
+ { string_range_ = str; string_has_changed_ = true; }
+
+ void find_reset ()
+ { start_offset_ = boost::begin(string_range_); }
+
+ string_range_type find_first ()
+ {
+ find_reset();
+ return find_next();
+ }
+
+ string_range_type find_next()
+ {
+ apply_changes();
+ if (start_offset_ == boost::end(string_range_))
+ return string_range_type(start_offset_, start_offset_);
+ string_range_type ret =
+ algorithm_type::find(start_offset_);
+ if (boost::begin(ret) == boost::end(string_range_))
+ {
+ start_offset_ = boost::end(string_range_);
+ return ret;
+ }
+ else
+ {
+ start_offset_ = boost::begin(ret);
+ ++start_offset_;
+ return ret;
+ }
+ }
+
+ substring_range_type get_substring_range() const { return substring_range_; }
+ string_range_type get_string_range() const { return string_range_; }
+
+ typename boost::call_traits<comparator_type>::reference get_comparator()
+ { return comparator_; }
+
+ typename boost::call_traits<comparator_type>::const_reference get_comparator() const
+ { return comparator_; }
+
+ //! Gets a reference to the current allocator
+ typename boost::call_traits<allocator_type>::reference get_allocator()
+ { return allocator_; }
+
+ /*!
+ \overload
+ */
+ typename boost::call_traits<allocator_type>::const_reference get_allocator() const
+ { return allocator_; }
+
+ private:
+ substring_range_type substring_range_;
+ string_range_type string_range_;
+ bool substring_has_changed_, string_has_changed_;
- //! \todo copyable finder types?
+ comparator_type comparator_;
+ allocator_type allocator_;
+
+ string_iterator_type start_offset_;
+ };
+
+ //! \todo copyable finder types
//! A generic finder type
/*!
Allows simple use of various string searching algorithms.
\tparam Sequence1T The type of the substring
\tparam Sequence2T The type of the string
- \tparam Algorithm An algorithm policy class
+ \tparam Algorithm An algorithm class
which will be used for performing the searches. \see string_search.hpp
\tparam Comparator Optional template parameter passed to the algorithm.
Used for comparing individual characters for equality.
@@ -116,7 +166,8 @@
class Algorithm,
typename Comparator = ::boost::algorithm::is_equal,
class Allocator = typename Algorithm::default_allocator_type,
- template <class,class,class,class> class AdditionalBehavior = boost::algorithm::finder_no_additional_behavior
+ template <class,class,class,class,class,class> class AdditionalBehavior =
+ boost::algorithm::finder_no_additional_behavior
>
class finder_t : private Algorithm::algorithm</*typename boost::range_const_iterator<Sequence1T>::type,
typename boost::range_const_iterator<Sequence2T>::type,
@@ -125,7 +176,9 @@
typename boost::range_const_iterator<Sequence1T>::type,
typename boost::range_const_iterator<Sequence2T>::type,
Comparator, Allocator>,
- private AdditionalBehavior<Sequence1T,Sequence2T,Algorithm,Comparator>
+ private AdditionalBehavior<
+ typename finder_t<Sequence1T, Sequence2T, Algorithm, Comparator, Allocator, AdditionalBehavior>,
+ Sequence1T,Sequence2T,Algorithm,Comparator,Allocator>
{
//! \todo: Maybe write a String concept?
//! \todo: Currently, there's a SGI Sequence Concept implemented by Boost.ConceptCheck,
@@ -191,7 +244,7 @@
template <class Range2T>
finder_t (const Sequence1T *const substring, const Range2T &string,
Comparator comparator = Comparator(), Allocator allocator = Allocator(),
- typename boost::disable_if<typename ::boost::detail::is_pointer_to<Range2T,Sequence2T> >::type* = 0)
+ typename boost::disable_if<typename ::boost::algorithm::detail::is_pointer_to<Range2T,Sequence2T> >::type* = 0)
: comparator_(comparator), allocator_(allocator),
substring_optional_copy_(), substring_range_(substring?*substring:substring_optional_copy_),
string_optional_copy_(), string_range_(),
@@ -207,7 +260,7 @@
template <class Range1T>
explicit finder_t (const Range1T &substring, const Sequence2T *const string = 0,
Comparator comparator = Comparator(), Allocator allocator = Allocator(),
- typename boost::disable_if<typename ::boost::detail::is_pointer_to<Range1T,Sequence1T> >::type* = 0)
+ typename boost::disable_if<typename ::boost::algorithm::detail::is_pointer_to<Range1T,Sequence1T> >::type* = 0)
: comparator_(comparator), allocator_(allocator),
substring_optional_copy_(), substring_range_(),
string_optional_copy_(), string_range_(string?boost::as_literal(*string):string_optional_copy_),
@@ -223,8 +276,8 @@
template <class Range1T, class Range2T>
finder_t (const Range1T &substring, const Range2T &string,
Comparator comparator = Comparator(), Allocator allocator = Allocator(),
- typename boost::disable_if<boost::mpl::or_<typename ::boost::detail::is_pointer_to<Range1T,Sequence1T>,
- typename ::boost::detail::is_pointer_to<Range2T,Sequence2T> > >::type* = 0)
+ typename boost::disable_if<boost::mpl::or_<typename ::boost::algorithm::detail::is_pointer_to<Range1T,Sequence1T>,
+ typename ::boost::algorithm::detail::is_pointer_to<Range2T,Sequence2T> > >::type* = 0)
: comparator_(comparator), allocator_(allocator),
substring_optional_copy_(), substring_range_(),
string_optional_copy_(), string_range_(),
@@ -257,7 +310,7 @@
Sequence1T const &&substring,
const Range2T &string,
Comparator comparator = Comparator(), Allocator allocator = Allocator(),
- typename boost::disable_if<typename ::boost::detail::is_pointer_to<Range2T,Sequence2T> >::type* = 0)
+ typename boost::disable_if<typename ::boost::algorithm::detail::is_pointer_to<Range2T,Sequence2T> >::type* = 0)
: comparator_(comparator), allocator_(allocator),
substring_optional_copy_(std::move(substring)), substring_range_(substring_optional_copy_),
string_optional_copy_(), string_range_(),
@@ -296,7 +349,7 @@
finder_t (const Range1T &substring,
Sequence2T const &&string,
Comparator comparator = Comparator(), Allocator allocator = Allocator(),
- typename boost::disable_if<typename ::boost::detail::is_pointer_to<Range1T,Sequence1T> >::type* = 0)
+ typename boost::disable_if<typename ::boost::algorithm::detail::is_pointer_to<Range1T,Sequence1T> >::type* = 0)
: comparator_(comparator), allocator_(allocator),
substring_optional_copy_(), substring_range_(),
string_optional_copy_(std::move(string)), string_range_(string_optional_copy_),
@@ -350,7 +403,7 @@
template <typename RangeT>
void set_substring(RangeT const &substring,
typename boost::disable_if<
- typename ::boost::detail::is_pointer_to<RangeT,Sequence1T> >::type* = 0)
+ typename ::boost::algorithm::detail::is_pointer_to<RangeT,Sequence1T> >::type* = 0)
{
boost::iterator_range<typename boost::range_const_iterator<RangeT>::type> substring_range =
boost::as_literal(substring);
@@ -389,7 +442,7 @@
*/
template <typename RangeT>
void set_string(RangeT const &string,
- typename boost::disable_if<typename ::boost::detail::is_pointer_to<RangeT,Sequence2T> >::type* = 0)
+ typename boost::disable_if<typename ::boost::algorithm::detail::is_pointer_to<RangeT,Sequence2T> >::type* = 0)
{
boost::iterator_range<typename boost::range_const_iterator<RangeT>::type> string_range =
boost::as_literal(string);
@@ -421,32 +474,6 @@
}
# endif
- //! Notify the finder that the pointees have changed
- /*! If the substring or the string have been passed as pointers, the finder
- cannot know when the pointees have changed. Therefore, this function
- must be called before any call to find().
- \todo Give it more thought
- */
- void refresh()
- {
- find_reset();
- on_substring_change(); //! \todo should only be called if the substring is a pointer
- on_string_change(); //! \todo should only be called if the string is a pointer
- }
-
- //! \todo doc et al
- void refresh_substring()
- {
- on_substring_change();
- find_reset();
- }
-
- void refresh_string()
- {
- on_string_change();
- find_reset();
- }
-
//! \todo Change the object's substring or just use a temporary one?
//! \todo Maybe this shouldn't be a part of finder_t, but a part of a certain AdditionalBehaviorT
//! Perform a search...
@@ -553,63 +580,19 @@
}
}
}
- //detail::make_properly_copyable<substring_range_type> substring_optional_copy_;
- //detail::make_properly_copyable<string_range_type> string_optional_copy_;
- /*typename std::basic_string<substring_char_type, std::char_traits<substring_char_type>, allocator_type>
- substring_optional_copy_;*/
+
substring_type substring_optional_copy_;
string_type string_optional_copy_;
- /*typename std::basic_string<string_char_type, std::char_traits<string_char_type>, allocator_type>
- string_optional_copy_;*/
comparator_type comparator_;
allocator_type allocator_;
-
- //! \TODO make it so that pointers are stored here when appropriate (or must be 0 otherwise)
- //! these will be used when refresh_substring()/refresh_string() are called
- //! and are used to update substring_range_ and string_range_ via begin()+end()
- //! then those functions will update the ranges of the algorithm and call
- //! on_substring_change()/on_string_change().
- //! \TODO think if it wouldn't be better if the algorithms could somehow use pointers to ranges or some stuff like that.
-
- Sequence1T *substring_pointer_;
- Sequence2T *string_pointer_;
-
- //! \todo Somehow fix this. This makes us able to
- //! only store ranges of type substring_iterator_type, and not of std::basic_string.
substring_range_type substring_range_;
string_range_type string_range_;
string_iterator_type start_offset_;
bool substring_has_changed_, string_has_changed_;
};
- template <class,class,class,class> struct finder_no_additional_behavior { };
+ template <class,class,class,class,class,class> struct finder_no_additional_behavior { };
- /*
- template <class Range1T, class Range2T, class Algorithm,
- class Comparator = ::boost::algorithm_is_equal,
- class Allocator =
- typename Algorithm::algorithm<
- typename boost::range_const_iterator<Range1T>::type,
- typename boost::range_const_iterator<Range2T>::type,
- Comparator>::allocator_type
- >
- class simplified_finder_t :
- public boost::algorithm::detail::finder_typedefs<Range1T,Range2T,ComparatorT,AllocatorT>,
- private Algorithm::algorithm<
- typename boost::range_const_iterator<Sequence1T>::type,
- typename boost::range_const_iterator<Sequence2T>::type,
- Comparator,Allocator>
- {
- //! \todo Add concept assertions here.
-
- public:
- simplified_finder_t(Range1T const *const substr = 0, Range2T const *const str = 0)
- : substr_(substr), str_(str), algorithm() { }
- private:
- Range1T const *const substr_;
- Range2T const *const str_;
- };
- */
// Finder generators ------------------------------------------//
Added: sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search.hpp
==============================================================================
--- (empty file)
+++ sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search.hpp 2010-07-13 21:24:19 EDT (Tue, 13 Jul 2010)
@@ -0,0 +1,12 @@
+#ifndef BOOST_ALGORITHM_STRING_SEARCH_HPP
+#define BOOST_ALGORITHM_STRING_SEARCH_HPP
+
+//! \todo Make sure this header is included by boost/algorithm/string.hpp
+
+#include <boost/algorithm/string/string_search/naive_search.hpp>
+#include <boost/algorithm/string/string_search/rabin_karp.hpp>
+#include <boost/algorithm/string/string_search/knuth_morris_pratt.hpp>
+#include <boost/algorithm/string/string_search/boyer_moore.hpp>
+#include <boost/algorithm/string/string_search/suffix_array.hpp>
+
+#endif
\ No newline at end of file
Modified: sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/boyer_moore.hpp
==============================================================================
--- sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/boyer_moore.hpp (original)
+++ sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/boyer_moore.hpp 2010-07-13 21:24:19 EDT (Tue, 13 Jul 2010)
@@ -3,7 +3,17 @@
#include <iterator>
#include <allocators>
+#include <utility>
#include <vector>
+#include <boost/range/begin.hpp>
+#include <boost/range/end.hpp>
+#include <boost/range/adaptor/reversed.hpp>
+#include <boost/range/algorithm/for_each.hpp>
+#include <boost/range/distance.hpp>
+#include <boost/call_traits.hpp>
+#include <boost/type_traits/is_same.hpp>
+#include <boost/static_assert.hpp>
+#include <map>
namespace boost { namespace algorithm {
struct boyer_moore
@@ -17,11 +27,18 @@
public:
typedef ForwardIterator1T substring_iterator_type;
typedef ForwardIterator2T string_iterator_type;
- typedef typename std::iterator_traits<substring_iterator_type>::value_type substring_char_type;
+ typedef typename
+ std::iterator_traits<substring_iterator_type>::value_type substring_char_type;
typedef typename std::iterator_traits<string_iterator_type>::value_type string_char_type;
typedef typename boost::iterator_range<substring_iterator_type> substring_range_type;
typedef typename boost::iterator_range<string_iterator_type> string_range_type;
+ typedef Comparator comparator_type;
+ typedef Allocator allocator_type;
protected:
+ algorithm () {
+ BOOST_STATIC_ASSERT((boost::is_same<substring_char_type,string_char_type>::value));
+ }
+
string_range_type find(string_iterator_type start)
{
return find(start, std::iterator_traits<ForwardIterator2T>::iterator_category());
@@ -34,25 +51,120 @@
}
//No precomputation to be done on the string
inline void on_string_change()
- {
-
- }
+ { }
private:
- void on_substring_change(std::random_access_iterator_tag)
+ struct compute_first_table_functor {
+ void operator()(typename boost::call_traits<substring_char_type>::param_type c)
+ {
+ //alg_.table1.insert( std::make_pair(c, alg_.substr_size_ - 1 - idx_) );
+ if (idx_ != 0) alg_.table1.insert(std::make_pair(c, idx_));
+
+ ++idx_;
+ }
+ compute_first_table_functor (algorithm &alg) : idx_(0), alg_(alg)
+ { alg_.table1.clear(); }
+ private:
+ std::size_t idx_;
+ algorithm &alg_;
+ };
+
+ struct compute_second_table_functor {
+ void operator()(typename boost::call_traits<substring_char_type>::param_type c)
+ {
+
+ }
+ compute_second_table_functor (algorithm &alg) : idx_(0), alg_(alg)
+ { /*alg_.table2.clear();*/ }
+ private:
+ std::size_t idx_;
+ algorithm &alg_;
+ };
+
+ //precomputation on pattern=bidirectional range
+ void on_substring_change(std::bidirectional_iterator_tag)
{
substring_range_type const &substr = static_cast<Finder*>(this)->get_substring_range();
comparator_type const &comp = static_cast<Finder*>(this)->get_comparator();
+
+ substr_size_ = boost::distance(substr);
+ //Compute the first table
+ boost::for_each(substr | boost::adaptors::reversed,
+ compute_first_table_functor(*this));
+
+ //Compute the second table
+ boost::for_each(substr | boost::adaptors::reversed,
+ compute_second_table_functor(*this));
}
+ //finding in text=random access range
string_range_type find(string_iterator_type start, std::random_access_iterator_tag)
{
string_range_type const &str = static_cast<Finder*>(this)->get_string_range();
substring_range_type const &substr = static_cast<Finder*>(this)->get_substring_range();
comparator_type const &comp = static_cast<Finder*>(this)->get_comparator();
+
+ std::size_t str_idx, substr_idx, str_size;
+
+ if (substr_size_ == 0)
+ return string_range_type(start, start);
+
+ str_size = boost::end(str) - start;
+ for (str_idx = substr_size_ - 1, substr_idx = substr_size_ - 1; str_idx < str_size;)
+ {
+ if (comp(start[str_idx],
+ boost::begin(substr)[substr_idx]))
+ {
+ if (substr_idx == 0)
+ {
+ return string_range_type(start+str_idx,start+str_idx+substr_size_);
+ }
+ else { assert(str_idx > 0); --substr_idx; --str_idx; }
+ }
+ else
+ {
+ table1_type::const_iterator iter = table1.find(start[str_idx]);
+ if (iter == table1.end())
+ {
+ str_idx += substr_size_ - substr_idx;
+ substr_idx = substr_size_ - 1;
+ }
+ else if (iter->second > substr_idx)
+ {
+ str_idx += iter->second;
+ substr_idx = substr_size_ - 1;
+ //str_idx += iter->second - substr_idx + substr_size_ - 1 - substr_idx;
+ //substr_idx = substr_size_ - 1;
+ //substr_idx = substr_size_ - 1 - iter->second;
+ //str_idx += substr_size_ - 1 - substr_idx;
+ //substr_idx = substr_size_ - 1;
+ }
+ else
+ {
+ assert(substr_size_ >= substr_idx);
+ str_idx += substr_size_-substr_idx;
+ substr_idx = substr_size_ - 1;
+ }
+ }
+ }
+ return string_range_type(boost::end(str), boost::end(str));
}
+
+ //!\todo Get a better data structure here (hash table?)
+ //!\todo Find a better way for custom allocators here
+ typedef typename std::map<substring_char_type, std::size_t,
+ std::less<substring_char_type>,
+ typename std::allocator<std::pair<const substring_char_type, std::size_t> >
+ > table1_type;
+
+ table1_type table1;
+
+ std::size_t substr_size_;
+
};
};
} }
+namespace boost { using boost::algorithm::boyer_moore; }
+
#endif
\ No newline at end of file
Modified: sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/detail/rabin_karp.hpp
==============================================================================
--- sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/detail/rabin_karp.hpp (original)
+++ sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/detail/rabin_karp.hpp 2010-07-13 21:24:19 EDT (Tue, 13 Jul 2010)
@@ -9,6 +9,8 @@
#include <boost/range/iterator.hpp>
#include <boost/call_traits.hpp>
#include <iterator>
+#include <boost/range/begin.hpp>
+#include <boost/range/end.hpp>
namespace boost { namespace algorithm { namespace detail {
Modified: sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/knuth_morris_pratt.hpp
==============================================================================
--- sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/knuth_morris_pratt.hpp (original)
+++ sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/knuth_morris_pratt.hpp 2010-07-13 21:24:19 EDT (Tue, 13 Jul 2010)
@@ -3,6 +3,8 @@
#include <iterator>
#include <boost/range/iterator_range.hpp>
+#include <boost/range/begin.hpp>
+#include <boost/range/end.hpp>
#include <vector>
#include <allocators>
@@ -80,6 +82,8 @@
substring_range_type const &substr = static_cast<Finder*>(this)->get_substring_range();
comparator_type const &comp = static_cast<Finder*>(this)->get_comparator();
+ //!\todo clear the container first
+ failure_func.clear();
failure_func.reserve(boost::end(substr) - boost::begin(substr));
std::size_t idx, q, substr_size = boost::end(substr) - boost::begin(substr);
failure_func.push_back(0); failure_func.push_back(0);
@@ -107,4 +111,6 @@
};
} }
+namespace boost { using boost::algorithm::knuth_morris_pratt; }
+
#endif
\ No newline at end of file
Modified: sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/naive_search.hpp
==============================================================================
--- sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/naive_search.hpp (original)
+++ sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/naive_search.hpp 2010-07-13 21:24:19 EDT (Tue, 13 Jul 2010)
@@ -7,68 +7,63 @@
#include <boost/range/begin.hpp>
#include <boost/range/end.hpp>
-namespace boost
-{
- namespace algorithm
+namespace boost { namespace algorithm {
+ //! \todo Copyable
+ struct naive_search
{
- //! \todo Copyable
- struct naive_search
+ typedef boost::mpl::void_ default_allocator_type;
+
+ template <class Finder, class Iterator1T, class Iterator2T, class ComparatorT, class AllocatorT>
+ struct algorithm
{
- typedef boost::mpl::void_ default_allocator_type;
+ protected:
+ algorithm () { }
- template <class Finder, class Iterator1T, class Iterator2T, class ComparatorT, class AllocatorT>
- struct algorithm
- {
- protected:
- algorithm () { }
+ typename boost::iterator_range<Iterator2T> find(Iterator2T start)
+ {
+ typedef typename Finder::string_iterator_type string_iterator_type;
+ typedef typename Finder::substring_iterator_type substring_iterator_type;
+ typedef typename Finder::substring_range_type substring_range_type;
+ typedef typename Finder::string_range_type string_range_type;
+ typedef typename Finder::comparator_type comparator_type;
+ string_range_type const &str = static_cast<Finder*>(this)->get_string_range();
+ substring_range_type const &substr = static_cast<Finder*>(this)->get_substring_range();
+ comparator_type comparator = static_cast<Finder*>(this)->get_comparator();
- typename boost::iterator_range<Iterator2T> find(Iterator2T start)
+ for (;
+ start != boost::end(str); ++start)
{
- typedef typename Finder::string_iterator_type string_iterator_type;
- typedef typename Finder::substring_iterator_type substring_iterator_type;
- typedef typename Finder::substring_range_type substring_range_type;
- typedef typename Finder::string_range_type string_range_type;
- typedef typename Finder::comparator_type comparator_type;
- string_range_type const &str = static_cast<Finder*>(this)->get_string_range();
- substring_range_type const &substr = static_cast<Finder*>(this)->get_substring_range();
- comparator_type comparator = static_cast<Finder*>(this)->get_comparator();
-
- for (;
- start != boost::end(str); ++start)
+ string_iterator_type str_iter(start);
+ substring_iterator_type substr_iter;
+ for (substr_iter = boost::begin(substr);
+ substr_iter != boost::end(substr) && str_iter != boost::end(str);
+ ++substr_iter, ++str_iter)
{
- string_iterator_type str_iter(start);
- substring_iterator_type substr_iter;
- for (substr_iter = boost::begin(substr);
- substr_iter != boost::end(substr) && str_iter != boost::end(str);
- ++substr_iter, ++str_iter)
- {
- if (!comparator(*str_iter, *substr_iter)) break;
- }
- if (substr_iter == boost::end(substr))
- return boost::iterator_range<string_iterator_type>(start, str_iter);
+ if (!comparator(*str_iter, *substr_iter)) break;
}
- return boost::iterator_range<string_iterator_type>(
- boost::end(str),boost::end(str));
+ if (substr_iter == boost::end(substr))
+ return boost::iterator_range<string_iterator_type>(start, str_iter);
}
- //! It is guaranteed that each of these two functions will get called at least once before find()
- //! is used.
- //No precomputation to be done on the substring
- inline void on_substring_change()
- {
- }
- //No precomputation to be done on the string
- inline void on_string_change()
- {
- }
- };
-
+ return boost::iterator_range<string_iterator_type>(
+ boost::end(str),boost::end(str));
+ }
+ //! It is guaranteed that each of these two functions will get called at least once before find()
+ //! is used.
+ //No precomputation to be done on the substring
+ inline void on_substring_change()
+ {
+ }
+ //No precomputation to be done on the string
+ inline void on_string_change()
+ {
+ }
};
+
+ };
- }
+} }
- using algorithm::naive_search;
-
-}
+namespace boost { using boost::algorithm::naive_search; }
#endif
\ No newline at end of file
Modified: sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/rabin_karp.hpp
==============================================================================
--- sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/rabin_karp.hpp (original)
+++ sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/rabin_karp.hpp 2010-07-13 21:24:19 EDT (Tue, 13 Jul 2010)
@@ -9,6 +9,8 @@
#include <boost/mpl/void.hpp>
#include <cstdlib>
#include <boost/static_assert.hpp>
+#include <boost/range/begin.hpp>
+#include <boost/range/end.hpp>
#include <boost/algorithm/string/string_search/detail/rabin_karp.hpp>
@@ -42,59 +44,22 @@
protected:
//construct the algorithm given iterator ranges for the substring and the string
algorithm () {
+ //!\todo add more assertions here
BOOST_STATIC_ASSERT(
sizeof(boost::iterator_value<Iterator1T>::type)*2 <= sizeof(HashType)
);
}
- /*string_range_type find(string_iterator_type start)
- {
- if (start == boost::begin(string) && string_computed_start_offset_) on_string_change();
-
- }
-
- void on_substring_change()
- {
- first_substring_hash_ = static_cast<HashType>(0);
- second_substring_hash_ = static_cast<HashType>(0);
- substring_size_ = 0;
- for (substring_iterator_type it = boost::begin(substring_); it != boost::end(substring_); ++it)
- {
- substring_size_++;
- first_substring_hash_ = (first_substring_hash_*FirstBase + *it) % FirstModulo;
- second_substring_hash_ = (second_substring_hash_*SecondBase + *it) % SecondModulo;
- }
- if (string_computed_start_offset_ > 0) on_string_change();
- }
-
- void on_string_change()
- {
- first_string_hash_ = static_cast<HashType>(0);
- second_string_hash_ = static_cast<HashType>(0);
- string_computed_start_offset_ = 0;
- string_computed_end_offset_ = 0;
- for (string_iterator_type it = boost::begin(string_);
- it != boost::end(string_) && string_computed_end_offset_ < substring_size; ++it)
- {
- ++string_computed_end_offset_;
- first_string_hash_ = (first_string_hash_ * FirstBase + *it) % FirstModulo;
- second_string_hash_ = (second_string_hash_ * SecondBase + *it) % SecondModulo;
- }
-
- //string_computed_upto =
- }*/
-
-
-
+
};
};
- //1/3732152659 odds of collision. used with char
+ //1/3732152659 odds of collision. useful with char
typedef rabin_karp_algorithm<boost::uint32_t,257,64433,277,57923> rabin_karp32;
- //1/(N*M) odds of collision. used with wchar_t
- typedef rabin_karp_algorithm<boost::uint64_t,1,2,3,4> rabin_karp64;
+ //1/150167080229379589 odds of collision. useful with wchar_t
+ typedef rabin_karp_algorithm<boost::uint64_t,337515847,373587883,255150899,401959183> rabin_karp64;
-} // namespace algorithm
-} // namespace boost
+} } // namespace algorithm, namespace boost
+namespace boost { using boost::algorithm::rabin_karp32; using boost::algorithm::rabin_karp64; }
#endif
\ No newline at end of file
Modified: sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/suffix_array.hpp
==============================================================================
--- sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/suffix_array.hpp (original)
+++ sandbox/SOC/2010/stringalgos/boost/algorithm/string/string_search/suffix_array.hpp 2010-07-13 21:24:19 EDT (Tue, 13 Jul 2010)
@@ -1,14 +1,209 @@
#ifndef BOOST_ALGORITHM_SUFFIX_ARRAY_HPP
#define BOOST_ALGORITHM_SUFFIX_ARRAY_HPP
+#include <iterator>
+#include <vector>
+#include <algorithm>
+#include <boost/range/begin.hpp>
+#include <boost/range/end.hpp>
+#include <boost/range/iterator_range.hpp>
+#include <boost/range/algorithm/sort.hpp>
+
namespace boost { namespace algorithm {
- struct suffix_array
+ struct suffix_array_search
{
- template <class
+ typedef std::allocator<std::size_t> default_allocator_type;
+ template <class Finder,class RandomAccessIterator1T,
+ class RandomAccessIterator2T,class Comparator,class Allocator>
class algorithm
{
+ public:
+ typedef RandomAccessIterator1T substring_iterator_type;
+ typedef RandomAccessIterator2T string_iterator_type;
+ typedef typename
+ std::iterator_traits<substring_iterator_type>::value_type substring_char_type;
+ typedef typename std::iterator_traits<string_iterator_type>::value_type string_char_type;
+ typedef typename boost::iterator_range<substring_iterator_type> substring_range_type;
+ typedef typename boost::iterator_range<string_iterator_type> string_range_type;
+ typedef Comparator comparator_type;
+ typedef Allocator allocator_type;
+ protected:
+ algorithm () : found_matches_(false), pos_(), matches_() { }
+
+ void on_substring_change()
+ { on_substring_change(std::iterator_traits<substring_iterator_type>::iterator_category()); }
+
+ void on_string_change()
+ { on_string_change(std::iterator_traits<string_iterator_type>::iterator_category()); }
+
+ string_range_type find (string_iterator_type start)
+ {
+ substring_range_type const &substr = static_cast<Finder*>(this)->get_substring_range();
+ string_range_type const &str = static_cast<Finder*>(this)->get_string_range();
+ std::size_t start_offset = start - boost::begin(str),
+ substr_size = boost::end(substr) - boost::begin(substr),
+ str_size = boost::end(str) - boost::begin(str);
+
+ if (found_matches_)
+ return find_first_match(start_offset);
+
+ if (boost::begin(str) == boost::end(str)) return string_range_type(boost::end(str), boost::end(str));
+
+ std::size_t suffix_left, suffix_right;
+ std::size_t firstsuffix_end = pos_[0] + substr_size, lastsuffix_end = pos_.back()+substr_size;
+ //the end position of the smallest lexicographic suffix
+ //if (firstsuffix_end > str.size()) firstsuffix_end = str.size();
+ //the end position of the biggest lexicographic suffix
+ if (lastsuffix_end > str_size) lastsuffix_end = str_size;
+ //the substring is smaller than the smallest lexicographic suffix, therefore no matches
+ //if (std::lexicographical_compare(substr.begin(), substr.end(),str.begin()+pos[0],str.begin()+firstsuffix_end))
+ if (suffix_less(substr, str, 0) ||
+ std::lexicographical_compare(str.begin()+pos_.back(),str.begin()+lastsuffix_end,substr.begin(),substr.end())
+ )
+ {
+ found_matches_ = true;
+ matches_.clear();
+ return string_range_type( boost::end(str), boost::end(str) );
+ }
+ //the substring is bigger than the biggest lexicographic suffix, therefore no matches
+ //else if ()
+ // return;
+ //perform binary search in the array of suffixes, find a range of matching suffixes
+ else
+ {
+ std::size_t left = 0, right=str_size, middle, result = 0;
+ while (right > left) // nonsingular range [left,right)
+ {
+ middle = (left+right)/2;
+ //does the substring match the prefix of the suffix `middle'?
+ //if (std::equal(substr.begin(), substr.end(), str.begin()+pos[middle]))
+ if (suffix_equal(substr, str, middle))
+ { result = middle; break; }
+ //is the substring lexicographically less than the prefix of the suffix `middle'?
+ else if (suffix_less(substr, str, middle))
+ //else if (std::lexicographical_compare(substr.begin(), substr.end(),
+ // str.begin()+pos[middle], str.end()))
+ { right = middle; }
+ //it must be greater.
+ else { left = middle+1; }
+ }
+ //empty interval, no matches
+ if (left == right) return string_range_type(boost::end(str), boost::end(str));
+
+
+ //suffix_left = lower bound of the matches
+ if (result > 0)
+ {
+ suffix_left = result;
+ //while (suffix_left >= 1 && std::equal(substr.begin(), substr.end(), str.begin()+pos[suffix_left-1]))
+ while (suffix_left >= 1 && suffix_equal(substr, str, suffix_left-1))
+ --suffix_left;
+ //++suffix_left;
+ } else suffix_left = result;
+
+ //suffix_right = upper bound of the matches
+ suffix_right = result+1;
+ //while (suffix_right < str.size() && std::equal(substr.begin(), substr.end(), str.begin()+pos[suffix_right]))
+ while (suffix_right < str_size && suffix_equal(substr, str, suffix_right))
+ ++suffix_right;
+
+ //there are (suffix_right - suffix_left) matches, corresponding to the positions:
+ // { pos[k] | k \in [suffix_left,suffix_right) }
+ found_matches_ = true;
+ matches_.clear();
+ matches_.reserve(suffix_right - suffix_left);
+ for (std::size_t k = suffix_left; k < suffix_right; ++k)
+ matches_.push_back(pos_[k]);
+ boost::sort(matches_);
+ return find_first_match(start_offset);
+ }
+ }
+ private:
+ inline string_range_type find_first_match (std::size_t start_offset)
+ {
+ string_range_type const &str = static_cast<Finder*>(this)->get_string_range();
+ substring_range_type const &substr = static_cast<Finder*>(this)->get_substring_range();
+
+ std::vector<std::size_t, Allocator>::const_iterator first_match =
+ std::lower_bound(matches_.begin(), matches_.end(), start_offset);
+ if (first_match == matches_.end())
+ return string_range_type(
+ boost::end(str), boost::end(str));
+ std::size_t substr_size = boost::end(substr) - boost::begin(substr);
+ assert(*first_match + substr_size <= static_cast<std::size_t>(str.size()) );
+ return string_range_type(
+ boost::begin(str) + (*first_match),
+ boost::begin(str) + (*first_match) + substr_size
+ );
+ }
+ void on_substring_change(std::random_access_iterator_tag)
+ {
+ found_matches_ = false;
+ }
+
+ void on_string_change(std::random_access_iterator_tag)
+ {
+ string_range_type const &str = static_cast<Finder*>(this)->get_string_range();
+
+ // compute the suffix array
+ std::size_t str_size = boost::end(str) - boost::begin(str);
+ pos_.clear();
+ pos_.reserve(str_size);
+ std::generate_n(std::back_inserter(pos_), str_size, increasing_generator());
+ std::sort(pos_.begin(), pos_.end(), suffix_array_sort_comparator(str));
+
+ found_matches_ = false;
+ }
+
+ inline bool suffix_less(substring_range_type const &substr,
+ string_range_type const &str, std::size_t suffix)
+ {
+ std::size_t start_offset = pos_[suffix],
+ end_offset = start_offset + (boost::end(substr) - boost::begin(substr)),
+ str_size = boost::end(str) - boost::begin(str);
+ if (end_offset > str_size) end_offset = str_size;
+ return std::lexicographical_compare(
+ boost::begin(substr), boost::end(substr),
+ boost::begin(str) + start_offset, boost::begin(str) + end_offset);
+ }
+
+ inline bool suffix_equal(substring_range_type const &substr,
+ string_range_type const &str, std::size_t suffix)
+ {
+ std::size_t start_offset = pos_[suffix],
+ end_offset = start_offset + (boost::end(substr) - boost::begin(substr)),
+ str_size = boost::end(str) - boost::begin(str);
+ //the substr is longer than the suffix, they can't be equal.
+ if (end_offset > str_size) return false;
+ return std::equal(substr.begin(), substr.end(), boost::begin(str)+start_offset);
+ }
+
+ std::vector<std::size_t, Allocator> pos_;
+ bool found_matches_;
+ std::vector<std::size_t, Allocator> matches_;
+
+
+ struct increasing_generator
+ {
+ increasing_generator () : idx_(0) { }
+ inline std::size_t operator()() { return idx_++; }
+ std::size_t idx_;
+ };
+
+ struct suffix_array_sort_comparator
+ {
+ suffix_array_sort_comparator (string_range_type const &str) : str_(str) { }
+ bool operator()(std::size_t lhs, std::size_t rhs)
+ {
+ return std::lexicographical_compare(str_.begin()+lhs, str_.end(), str_.begin()+rhs, str_.end());
+ }
+ string_range_type const &str_;
+ };
+
};
};
} }
+namespace boost { using boost::algorithm::suffix_array_search; }
+
#endif
\ No newline at end of file
Modified: sandbox/SOC/2010/stringalgos/libs/algorithm/string/test/finder_test.cpp
==============================================================================
--- sandbox/SOC/2010/stringalgos/libs/algorithm/string/test/finder_test.cpp (original)
+++ sandbox/SOC/2010/stringalgos/libs/algorithm/string/test/finder_test.cpp 2010-07-13 21:24:19 EDT (Tue, 13 Jul 2010)
@@ -7,12 +7,14 @@
// See http://www.boost.org for updates, documentation, and revision history.
+
#include <boost/algorithm/string/find.hpp>
#include <boost/algorithm/string/finder.hpp>
#include <boost/algorithm/string/string_search/naive_search.hpp>
#include <boost/algorithm/string/string_search/rabin_karp.hpp>
#include <boost/algorithm/string/string_search/knuth_morris_pratt.hpp>
#include <boost/algorithm/string/string_search/boyer_moore.hpp>
+#include <boost/algorithm/string/string_search/suffix_array.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <boost/config.hpp>
@@ -23,6 +25,11 @@
#include <sstream>
#include <list>
+
+#if !defined(BOOST_HAS_RVALUE_REFS) && !defined(BOOST_NO_RVALUE_REFS)
+#define BOOST_HAS_RVALUE_REFS
+#endif
+
//#define BOOST_TEST_ALTERNATIVE_INIT_API
//#define BOOST_TEST_DYN_LINK
#include <boost/test/unit_test.hpp>
@@ -186,7 +193,6 @@
//!\todo fix
# ifdef BOOST_HAS_RVALUE_REFS
- BOOST_CHECK(false);
f_t f6( (Sequence1T(s1)), &S1); // rvalue, ptr
BOOST_CHECK(boost::equal(f6.get_substring_range(),s1));
BOOST_CHECK(superficial_range_equal(f6.get_string_range(),S1));
@@ -584,13 +590,22 @@
boost::unit_test::test_suite* init_unit_test_suite( int argc, char* argv[] )
{
- // Note: we should test with other allocators and comparators
+
+ /*!\todo test with other allocators and comparators
+ \todo test with different iterator types, depending on what each algorithm supports
+ \todo test matches of empty substring in nonempty string (should return empty range but iterators with proper offset)
+ \todo test the nonmatch of nonempty substring in empty string
+ \todo test the unique match of empty substring in empty string
+ \todo test more strings with consecutive substring matches
+ */
+
typedef boost::mpl::list<
boost::algorithm::naive_search,
boost::algorithm::rabin_karp32,
- boost::algorithm::knuth_morris_pratt/*,
+ boost::algorithm::rabin_karp64,
+ boost::algorithm::knuth_morris_pratt,
boost::algorithm::boyer_moore,
- boost::algorithm::suffix_array*/
+ boost::algorithm::suffix_array_search
> algorithm_list;
boost::unit_test::framework::master_test_suite().add(
BOOST_TEST_CASE_TEMPLATE(test_finders, algorithm_list)
Boost-Commit list run by bdawes at acm.org, david.abrahams at rcn.com, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk