Boost logo

Boost :

Subject: Re: [boost] [Review:Algorithms] Comments about search algorithms
From: Phil Endecott (spam_from_boost_dev_at_[hidden])
Date: 2011-09-26 10:29:17


Phil Endecott wrote:
> - A user faced with these three (or four) algorithms will need to
> choose one. The docs should offer some guidance, e.g. benchmarks or
> rules-of-thumb, about which to use.

In particular, I've not yet found a case where KMP is the best choice.
BM seems better in all the cases that I've tried and has the same or
better complexity bounds. (Well, it might perform slightly better when
the pattern length is 1 but in that case, std::find could be used.)
Does anyone know of any cases where KMP is likely to be better than
BM? (If we can't find any, is it worth including it?)

FYI a benchmark program that I've been using is included below.

Regards, Phil.

#include <iostream>
#include <string>
#include <cstdlib>
#include <algorithm>

#include "boost/algorithm/search.hpp"

using namespace std;

template <typename patIter>
class stdsearcher {

   patIter pat_first, pat_last;

public:
   stdsearcher(patIter first, patIter last):
     pat_first(first), pat_last(last)
   {}

   template <typename corpusIter>
   corpusIter operator()(corpusIter first, corpusIter last) const {
     return std::search(first, last, pat_first, pat_last);
   }

};

template <typename searcher>
int do_search(const string& pattern, const string& corpus)
{
   searcher s (pattern.begin(), pattern.end());
   string::const_iterator i = corpus.begin();
   int n = 0;
   while (1) {
     i = s(i,corpus.end());
     if (i == corpus.end()) break;
     ++n;
     ++i;
   }
   return n;
}

static void usage()
{
   cerr << "Usage: search --stdsearch|--bm|--bmh|--kmp <pattern>\n";
   exit(1);
}

int main(int argc, char* argv[])
{
   if (argc!=3) {
     usage();
   }

   string algo = argv[1];
   string pattern = argv[2];

   bool use_stdsearch = false, use_bm = false, use_bmh = false, use_kmp
= false;

   if (algo=="--stdsearch") {
     use_stdsearch = true;
   } else if (algo=="--bm") {
     use_bm = true;
   } else if (algo=="--bmh") {
     use_bmh = true;
   } else if (algo=="--kmp") {
     use_kmp = true;
   } else {
     usage();
   }

   string corpus;
   while (cin.good()) {
     string l;
     getline(cin,l);
     corpus.append(l);
   }

   int matches = 0;

   for (int round = 0; round<100; ++round) {
     if (use_stdsearch) {
       matches = do_search< stdsearcher<string::const_iterator> >(pattern,corpus);
     }
     if (use_bm) {
       matches = do_search<
boost::algorithm::boyer_moore<string::const_iterator> >(pattern,corpus);
     }
     if (use_bmh) {
       matches = do_search<
boost::algorithm::boyer_moore_horspool<string::const_iterator> >(pattern,corpus);
     }
     if (use_kmp) {
       matches = do_search<
boost::algorithm::knuth_morris_pratt<string::const_iterator> >(pattern,corpus);
     }
   }

   cout << matches << " matches found\n";

   exit(0);
}


Boost list run by bdawes at acm.org, gregod at cs.rpi.edu, cpdaniel at pacbell.net, john at johnmaddock.co.uk