Hi list,
I'm profiling some code and as far as I can tell the main problem I have is with the overhead of matrix indexing. My test program is a simple matrix multiplication that uses ublas matrix indexing to access elements, compared to using C style raw pointers.
The test program shows that C style indexing using pointers is much faster. I'm a bit worried because I've always believed that the optimizer was normally smart enough to 'unwrap' C++ code, look inside and determine that it all boils down to simply indexing a pointer. But it seems that this is not true in my test program...
Any hints? Ideas?
I've compiled with all possible optimizations I could think of, but pointers always beat matrix indexing.
#define NDEBUG
#define BOOST_UBLAS_NDEBUG
#include <boost/numeric/ublas/matrix.hpp>
namespace ublas = boost::numeric::ublas;
void indexing(const ublas::matrix<double>& A,
const ublas::matrix<double>& B,
ublas::matrix<double>& C)
{
C = ublas::zero_matrix<double>( A.size1(), B.size2() );
const size_t size1 = A.size1();
const size_t size2 = B.size2();
const size_t size3 = A.size2();
for (size_t i = 0; i < size1; ++i)
for (size_t k = 0; k < size3; ++k)
for (size_t j = 0; j < size2; ++j)
C(i,j) += A(i,k) * B(k,j);
}
void pointers(const ublas::matrix<double>& A,
const ublas::matrix<double>& B,
ublas::matrix<double>& C)
{
C = ublas::zero_matrix<double>( A.size1(), B.size2() );
const double* a = &A(0,0);
const double* b = &B(0,0);
double* c = &C(0,0);
const size_t size1 = A.size1();
const size_t size2 = B.size2();
const size_t size3 = A.size2();
for (size_t i = 0; i < size1; ++i)
for (size_t k = 0; k < size3; ++k)
for (size_t j = 0; j < size2; ++j)
c[i*size2+j] += a[i*size3+k] * b[k*size2+j];
}
const size_t N = 100;
const size_t M = 30000;
int main(int argc, char* argv[])
{
ublas::matrix<double> A(N, N), B(N, M), C;
for (size_t i = 0; i < A.size1(); ++i)
for (size_t j = 0; j < A.size2(); ++j)
A(i,j) = (i + j) / 100.0 + 1 - j;
for (size_t i = 0; i < B.size1(); ++i)
for (size_t j = 0; j < B.size2(); ++j)
B(i,j) = (i - j) / 1000.0 + 1;
if (argc > 1)
pointers(A, B, C);
else
indexing(A, B, C);
return 0;
}