|
Ublas : |
Subject: Re: [ublas] [RFC PATCH] ublas: improved dense matrix multiplication performance
From: palik imre (imre_palik_at_[hidden])
Date: 2016-02-20 10:50:59
Hi all,
Any ideas?
If nobody complains about the basic structure, then I'll do all the legwork, and finish this.
Cheers,
Imre
--------------------------------------------
On Sat, 13/2/16, Imre Palik <imre_palik_at_[hidden]> wrote:
Subject: [RFC PATCH] ublas: improved dense matrix multiplication performance
To: david.bellot_at_[hidden]
Cc: ublas_at_[hidden], "Imre Palik" <imre_palik_at_[hidden]>, "Michael Lehn" <michael.lehn_at_[hidden]>
Date: Saturday, 13 February, 2016, 14:43
This patch includes the gemm
implementation from Michael Lehn to
boost::ublas.
This modifies the workings of ublas::prod() and
ublas::axppy_prod()
to use gemm() above a certain matrix size.
This patch only contains the basic architecture, and a
generic c++
implementation. All the architecture, or compiler
specific stuff
will go to follow-up patches.
Signed-off-by: Imre Palik <imre_palik_at_[hidden]>
Cc: Michael Lehn <michael.lehn_at_[hidden]>
---
include/boost/numeric/ublas/detail/gemm.hpp Â
   | 279 ++++++++++++++++++++++
include/boost/numeric/ublas/matrix_expression.hpp |Â
59 ++++-
include/boost/numeric/ublas/operation.hpp Â
    | 87 ++++++-
3 files changed, 407 insertions(+), 18 deletions(-)
create mode 100644
include/boost/numeric/ublas/detail/gemm.hpp
diff --git a/include/boost/numeric/ublas/detail/gemm.hpp
b/include/boost/numeric/ublas/detail/gemm.hpp
new file mode 100644
index 0000000..cb4a343
--- /dev/null
+++ b/include/boost/numeric/ublas/detail/gemm.hpp
@@ -0,0 +1,279 @@
+//
+//Â Copyright (c) 2016
+//Â Michael Lehn, Imre Palik
+//
+//Â Distributed under the Boost Software License,
Version 1.0. (See
+//Â accompanying file LICENSE_1_0.txt or copy at
+//Â http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef _BOOST_UBLAS_GEMM_
+#define _BOOST_UBLAS_GEMM_
+
+#include <boost/type_traits/common_type.hpp>
+#include <boost/align/aligned_alloc.hpp>
+#include <boost/align/assume_aligned.hpp>
+#include <boost/static_assert.hpp>
+
+namespace boost { namespace numeric { namespace ublas {
namespace detail {
+
+Â Â template <typename T>
+Â Â struct prod_block_size {
+Â Â Â Â static const unsigned mc =
256;
+Â Â Â Â static const unsigned kc = 512;
// stripe length
+Â Â Â Â static const unsigned nc =
4092;
+Â Â Â Â static const unsigned mr = 4;
// stripe width for lhs
+Â Â Â Â static const unsigned nr = 12;
// stripe width for rhs
+Â Â Â Â static const unsigned align =
64; // align temporary arrays to this boundary
+Â Â Â Â static const unsigned limit =
26; // Use gemm from this size
+Â Â Â Â BOOST_STATIC_ASSERT_MSG(mc>0
&& kc>0 && nc>0 && mr>0
&& nr>0, "Invalid block size.");
+Â Â Â Â BOOST_STATIC_ASSERT_MSG(mc % mr
== 0, "MC must be a multiple of MR.");
+Â Â Â Â BOOST_STATIC_ASSERT_MSG(nc % nr
== 0, "NC must be a multiple of NR.");
+Â Â };
+
+Â Â template <typename E>
+Â Â void
+Â Â gescal(const typename E::value_type
&alpha, matrix_expression<E> &X)
+Â Â {
+Â Â Â Â typedef typename
E::size_type size_type;
+
+Â Â Â for (size_type i=0; i<X().size1();
++i) {
+Â Â Â Â Â Â for (size_type
j=0; j<X().size2(); ++j) {
+Â Â Â Â Â Â Â X()(i,j) *=
alpha;
+Â Â Â Â Â Â }
+Â Â Â Â }
+Â Â }
+
+Â Â template <typename Index, typename T>
+Â Â void
+Â Â geaxpy(Index m, Index n, const T &alpha,
+Â Â Â Â Â Â Â const T *X,
Index incRowX, Index incColX,
+Â Â Â Â Â Â Â TÂ
    *Y, Index incRowY, Index incColY)
+Â Â {
+Â Â Â Â for (Index j=0; j<n; ++j) {
+Â Â Â Â Â Â for (Index i=0;
i<m; ++i) {
+Â Â Â Â Â Â Â Â
Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX];
+Â Â Â Â Â Â }
+Â Â Â Â }
+Â Â }
+
+Â Â template <typename Index, typename TX>
+Â Â void
+Â Â gescal(Index m, Index n,
+Â Â Â Â Â Â Â const TX
&alpha,
+Â Â Â Â Â Â Â TX *X, Index
incRowX, Index incColX)
+Â Â {
+Â Â Â Â for (Index j=0; j<n; ++j) {
+Â Â Â Â Â Â for (Index i=0;
i<m; ++i) {
+Â Â Â Â Â Â Â Â
X[i*incRowX+j*incColX] *= alpha;
+Â Â Â Â Â Â }
+Â Â Â Â }
+Â Â }
+
+Â Â //-- Micro Kernel
--------------------------------------------------------------
+Â Â template <typename Index, typename T,
typename TC, typename BlockSize>
+Â Â void
+Â Â ugemm(Index kc, TC alpha, const T *A, const T
*B,
+Â Â Â Â Â TC beta, TC *C, Index
incRowC, Index incColC)
+Â Â {
+Â Â Â Â BOOST_ALIGN_ASSUME_ALIGNED(A,
BlockSize::align);
+Â Â Â Â BOOST_ALIGN_ASSUME_ALIGNED(B,
BlockSize::align);
+Â Â Â Â static const Index MR =
BlockSize::mr;
+Â Â Â Â static const Index NR =
BlockSize::nr;
+Â Â Â typename
boost::aligned_storage<sizeof(T[MR*NR]),alignof(BlockSize::align)>::type
Pa;
+Â Â Â T *P =
reinterpret_cast<T*>(Pa.address());
+Â Â Â for (unsigned c = 0; c < MR * NR;
c++)
+Â Â Â Â P[c] = 0;
+
+Â Â Â Â for (Index l=0; l<kc; ++l)
{
+Â Â Â Â Â for (Index i=0; i<MR;
++i) {
+Â Â Â Â Â Â Â for (Index
j=0; j<NR; ++j) {
+Â Â Â Â Â Â Â Â
  P[i* NR+j] += A[i]*B[j];
+Â Â Â Â Â Â Â Â }
+Â Â Â Â Â Â }
+Â Â Â Â Â A += MR;
+Â Â Â Â Â B += NR;
+Â Â Â Â }
+
+Â Â Â if (alpha!=TC(1)) {
+Â Â Â Â Â for (Index i=0; i<MR;
++i) {
+Â Â Â Â Â Â Â for (Index
j=0; j<NR; ++j) {
+Â Â Â Â Â Â Â Â
P[i*NR+j] *= alpha;
+Â Â Â Â Â Â }
+Â Â Â Â Â }
+Â Â Â }
+
+Â Â Â if (beta == TC(0)) {
+Â Â Â Â Â for (Index i=0; i<MR;
++i) {
+Â Â Â Â Â Â Â for (Index
j=0; j<NR; ++j) {
+Â Â Â Â Â Â Â Â
C[i*incRowC+j*incColC] = P[i*NR+j];
+Â Â Â Â Â Â }
+Â Â Â Â Â }
+Â Â Â } else {
+Â Â Â Â Â for (Index i=0; i<MR;
++i) {
+Â Â Â Â Â Â Â for (Index
j=0; j<NR; ++j) {
+Â Â Â Â Â Â Â Â
  C[i*incRowC+j*incColC] *= beta;
+Â Â Â Â Â Â Â Â
  C[i*incRowC+j*incColC] += P[i*NR+j];
+Â Â Â Â Â Â }
+Â Â Â Â Â }
+Â Â Â }
+Â Â }
+
+Â Â //-- Macro Kernel
--------------------------------------------------------------
+Â Â Â template <typename Index, typename T,
typename TC, typename BlockSize>
+Â Â void
+Â Â mgemm(Index mc, Index nc, Index kc, TC
alpha,
+Â Â Â Â Â const T *A, const T *B,
TC beta,
+Â Â Â Â Â TC *C, Index incRowC,
Index incColC)
+Â Â {
+Â Â Â Â static const Index MR =
BlockSize::mr;
+Â Â Â Â static const Index NR =
BlockSize::nr;
+    const Index mp =
(mc+MR-1) / MR;
+    const Index np =
(nc+NR-1) / NR;
+Â Â Â Â const Index mr_ = mc % MR;
+Â Â Â Â const Index nr_ = nc % NR;
+
+Â Â Â Â for (Index j=0; j<np; ++j)
{
+Â Â Â Â Â Â const Index nr =
(j!=np-1 || nr_==0) ? NR : nr_;
+Â Â Â Â Â Â T C_[MR*NR];
+
+Â Â Â Â Â Â for (Index i=0;
i<mp; ++i) {
+Â Â Â Â Â Â Â Â
const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
+
+Â Â Â Â Â Â Â Â if
(mr==MR && nr==NR) {
+Â Â Â Â Â Â Â
ugemm<Index, T, TC, BlockSize>(kc, alpha,
+Â Â Â Â Â Â Â Â
     &A[i*kc*MR],
&B[j*kc*NR],
+Â Â Â Â Â Â Â Â
     beta,
+Â Â Â Â Â Â Â Â
    Â
&C[i*MR*incRowC+j*NR*incColC],
+Â Â Â Â Â Â Â Â
     incRowC, incColC);
+Â Â Â Â Â Â Â Â }
else {
+Â Â Â Â Â Â Â Â
  std::fill_n(C_, MR*NR, T(0));
+Â Â Â Â Â Â Â Â
  ugemm<Index, T, TC, BlockSize>(kc,
alpha,
+Â Â Â Â Â Â Â Â
     &A[i*kc*MR],
&B[j*kc*NR],
+Â Â Â Â Â Â Â Â
     T(0),
+Â Â Â Â Â Â Â Â
     C_, NR, Index(1));
+Â Â Â Â Â Â Â Â
  gescal(mr, nr, beta,
+Â Â Â Â Â Â Â Â
   Â
   &C[i*MR*incRowC+j*NR*incColC],
+Â Â Â Â Â Â Â Â
       incRowC,
incColC);
+Â Â Â Â Â Â Â Â
  geaxpy(mr, nr, T(1), C_, NR, Index(1),
+Â Â Â Â Â Â Â Â
   Â
   &C[i*MR*incRowC+j*NR*incColC],
+Â Â Â Â Â Â Â Â
       incRowC,
incColC);
+Â Â Â Â Â Â Â Â }
+Â Â Â Â Â Â }
+Â Â Â Â }
+Â Â }
+
+Â Â //-- Packing blocks
------------------------------------------------------------
+Â Â Â template <typename E, typename T,
typename BlockSize>
+Â Â void
+Â Â pack_A(const matrix_expression<E>
&A, T *p)
+Â Â {
+Â Â Â Â typedef typename
E::size_type size_type;
+
+Â Â Â Â const size_type mc = A
().size1();
+Â Â Â Â const size_type kc = A
().size2();
+Â Â Â Â static const size_type MR =
BlockSize::mr;
+Â Â Â Â const size_type mp = (mc+MR-1)
/ MR;
+
+Â Â Â Â for (size_type j=0; j<kc;
++j) {
+Â Â Â Â Â Â for (size_type
l=0; l<mp; ++l) {
+Â Â Â Â Â Â Â Â for
(size_type i0=0; i0<MR; ++i0) {
+Â Â Â Â Â Â Â Â
  size_type i = l*MR + i0;
+Â Â Â Â Â Â Â Â
  size_type nu = l*MR*kc + j*MR + i0;
+Â Â Â Â Â Â Â Â
  p[nu]    = (i<mc) ?
A()(i,j) : T(0);
+Â Â Â Â Â Â Â Â }
+Â Â Â Â Â Â }
+Â Â Â Â }
+Â Â }
+
+Â Â template <typename E, typename T, typename
BlockSize>
+Â Â void
+Â Â pack_B(const matrix_expression<E>
&B, T *p)
+Â Â {
+Â Â Â Â typedef typename
E::size_type size_type;
+
+Â Â Â Â const size_type kc = B
().size1();
+Â Â Â Â const size_type nc = B
().size2();
+Â Â Â Â static const size_type NR =
BlockSize::nr;
+Â Â Â Â const size_type np = (nc+NR-1)
/ NR;
+
+Â Â Â Â for (size_type l=0; l<np;
++l) {
+Â Â Â Â Â Â for (size_type
j0=0; j0<NR; ++j0) {
+Â Â Â Â Â Â Â Â for
(size_type i=0; i<kc; ++i) {
+Â Â Â Â Â Â Â Â
  size_type j = l*NR+j0;
+Â Â Â Â Â Â Â Â
  size_type nu = l*NR*kc + i*NR + j0;
+Â Â Â Â Â Â Â Â
  p[nu]    = (j<nc) ?
B()(i,j) : T(0);
+Â Â Â Â Â Â Â Â }
+Â Â Â Â Â Â }
+Â Â Â Â }
+Â Â }
+
+Â Â //-- Frame routine
-------------------------------------------------------------
+Â Â template <typename E1, typename E2,
typename E3, typename BlockSize>
+Â Â void
+Â Â gemm(typename E3::value_type alpha, const
matrix_expression<E1> &e1,
+Â Â Â Â Â const
matrix_expression<E2> &e2,
+Â Â Â Â Â Â typename
E3::value_type beta, matrix_expression<E3> &e3)
+Â Â {
+Â Â Â Â typedef typename
E3::size_type size_type;
+Â Â Â Â typedef typename E1::value_type
value_type1;
+Â Â Â Â typedef typename E2::value_type
value_type2;
+Â Â Â Â typedef typename E3::value_type
value_type3;
+Â Â Â Â typedef typename
common_type<value_type1, value_type2>::type
value_type_i;
+
+Â Â Â Â static const size_type MC =
BlockSize::mc;
+Â Â Â Â static const size_type NC =
BlockSize::nc;
+
+Â Â Â Â const size_type m =
BOOST_UBLAS_SAME (e3 ().size1 (), e1 ().size1 ());
+Â Â Â const size_type n = BOOST_UBLAS_SAME (e3
().size2 (), e2 ().size2 ());
+Â Â Â Â const size_type k =
BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ());
+
+Â Â Â Â static const size_type KC =
BlockSize::kc;
+Â Â Â Â const size_type mb = (m+MC-1) /
MC;
+Â Â Â Â const size_type nb = (n+NC-1) /
NC;
+Â Â Â Â const size_type kb = (k+KC-1) /
KC;
+Â Â Â Â const size_type mc_ = m % MC;
+Â Â Â Â const size_type nc_ = n % NC;
+Â Â Â Â const size_type kc_ = k % KC;
+
+Â Â Â Â value_type3 *C_ =
&e3()(0,0);
+Â Â Â Â const size_type incRowC =
&e3()(1,0) - &e3()(0,0);
+Â Â Â Â const size_type incColC =
&e3()(0,1) - &e3()(0,0);
+Â Â Â Â value_type_i *A =
+Â Â Â Â Â Â
static_cast<value_type_i
*>(boost::alignment::aligned_alloc(BlockSize::align,
+Â Â Â Â Â Â Â Â Â
        Â
        Â
sizeof(value_type_i) * MC*KC));
+Â Â Â Â value_type_i *B =
+Â Â Â Â Â Â
static_cast<value_type_i
*>(boost::alignment::aligned_alloc(BlockSize::align,
+Â Â Â Â Â Â Â Â Â
        Â
        Â
sizeof(value_type_i) * KC*NC));
+
+Â Â Â Â if (alpha==value_type3(0) ||
k==0) {
+Â Â Â Â Â Â gescal(beta,
e3);
+Â Â Â Â Â Â return;
+Â Â Â Â }
+
+Â Â Â Â for (size_type j=0; j<nb;
++j) {
+Â Â Â Â Â Â size_type nc =
(j!=nb-1 || nc_==0) ? NC : nc_;
+
+Â Â Â Â Â Â for (size_type
l=0; l<kb; ++l) {
+Â Â Â Â Â Â Â Â
size_type kc = (l!=kb-1 || kc_==0) ? KC : kc_;
+Â Â Â Â Â Â Â Â
value_type3 beta_ = (l==0) ? beta : value_type3(1);
+
+Â Â Â Â Â Â Â Â
const matrix_range<const E2> Bs = subrange(e2(), l*KC,
l*KC+kc, j*NC, j*NC+nc);
+Â Â Â Â Â Â Â Â
pack_B<matrix_range<const E2>, value_type_i,
BlockSize>(Bs, B);
+
+Â Â Â Â Â Â Â Â for
(size_type i=0; i<mb; ++i) {
+Â Â Â Â Â Â Â Â
  size_type mc = (i!=mb-1 || mc_==0) ? MC :
mc_;
+
+Â Â Â Â Â Â Â Â
  const matrix_range<const E1> As =
subrange(e1(), i*MC, i*MC+mc, l*KC, l*KC+kc);
+Â Â Â Â Â Â Â Â
  pack_A<matrix_range<const E1>,
value_type_i, BlockSize>(As, A);
+
+Â Â Â Â Â Â Â Â
  mgemm<size_type, value_type_i, value_type3,
BlockSize>(mc, nc, kc, alpha, A, B, beta_,
+Â Â Â Â Â Â Â Â Â
 &C_[i*MC*incRowC+j*NC*incColC],
+Â Â Â Â Â Â Â Â Â
 incRowC, incColC);
+Â Â Â Â Â Â Â Â }
+Â Â Â Â Â Â }
+Â Â Â Â }
+Â Â Â boost::alignment::aligned_free(A);
+Â Â Â boost::alignment::aligned_free(B);
+Â Â }
+}}}}
+#endif
diff --git
a/include/boost/numeric/ublas/matrix_expression.hpp
b/include/boost/numeric/ublas/matrix_expression.hpp
index a363130..22bdb44 100644
--- a/include/boost/numeric/ublas/matrix_expression.hpp
+++ b/include/boost/numeric/ublas/matrix_expression.hpp
@@ -14,6 +14,7 @@
#define _BOOST_UBLAS_MATRIX_EXPRESSION_
#include <boost/numeric/ublas/vector_expression.hpp>
+#include <boost/numeric/ublas/detail/gemm.hpp>
// Expression templates based on ideas of Todd Veldhuizen
and Geoffrey Furnish
// Iterators based on ideas of Jeremy Siek
@@ -5460,20 +5461,40 @@ namespace boost { namespace numeric
{ namespace ublas {
  Â
   expression2_closure_type e2_;
    };
+Â Â namespace detail {
+Â Â Â template<class E1, class E2, class
P, bool s>
+Â Â Â struct binary_calculate_result_type;
+
+Â Â Â template<class E1, class E2, class
P>
+Â Â Â struct
binary_calculate_result_type<E1, E2, P, false> {
+Â Â Â typedef matrix_matrix_binary<E1, E2,
matrix_matrix_prod<E1, E2, P> > result_type;
+Â Â Â };
+
+Â Â Â // TODO: should elaborate on this for
some dense types.
+Â Â Â template<class E1, class E2, class
P>
+Â Â Â struct
binary_calculate_result_type<E1, E2, P, true> {
+Â Â Â typedef matrix<P> result_type;
+Â Â Â };
+Â Â }
+
    template<class T1, class E1,
class T2, class E2>
    struct matrix_matrix_binary_traits
{
-Â Â Â Â typedef unknown_storage_tag
storage_category;
+Â Â Â //Â Â Â Â typedef
unknown_storage_tag storage_category;
+Â Â Â typedef typename
storage_restrict_traits<typename E1::storage_category,
typename E2::storage_category>::storage_category
storage_category;
      typedef
unknown_orientation_tag orientation_category;
      typedef typename
promote_traits<T1, T2>::promote_type promote_type;
      typedef
matrix_matrix_binary<E1, E2, matrix_matrix_prod<E1,
E2, promote_type> > expression_type;
#ifndef BOOST_UBLAS_SIMPLE_ET_DEBUG
-Â Â Â Â typedef expression_type
result_type;
+Â Â Â //Â Â Â Â typedef
expression_type result_type;
+Â Â Â typedef typename
detail::binary_calculate_result_type<E1, E2,
promote_type, boost::is_base_of<dense_proxy_tag,
storage_category>::value>::result_type result_type;
#else
      typedef typename
E1::matrix_temporary_type result_type;
#endif
    };
-Â Â template<class E1, class E2>
+Â Â template<class E1, class E2,
+Â Â Â Â Â Â Â typename B =
detail::prod_block_size<typename common_type<typename
E1::value_type,
+Â Â Â Â Â Â Â Â Â
        Â
       Â
   typename E2::value_type>::type>
>
    BOOST_UBLAS_INLINE
    typename
matrix_matrix_binary_traits<typename E1::value_type, E1,
       Â
       Â
     typename E2::value_type,
E2>::result_type
@@ -5481,13 +5502,41 @@ namespace boost { namespace numeric
{ namespace ublas {
       const
matrix_expression<E2> &e2,
   Â
   unknown_storage_tag,
   Â
   unknown_orientation_tag) {
+
      typedef typename
matrix_matrix_binary_traits<typename E1::value_type, E1,
       Â
       Â
       Â
   typename E2::value_type,
E2>::expression_type expression_type;
      return
expression_type (e1 (), e2 ());
    }
+Â Â template<class E1, class E2,
+Â Â Â Â Â Â Â typename B =
detail::prod_block_size<typename common_type<typename
E1::value_type,
+Â Â Â Â Â Â Â Â Â
        Â
       Â
   typename E2::value_type>::type>
>
+Â Â BOOST_UBLAS_INLINE
+Â Â typename
matrix_matrix_binary_traits<typename E1::value_type, E1,
+Â Â Â Â Â Â Â Â
       Â
      typename
E2::value_type, E2>::result_type
+Â Â prod (const matrix_expression<E1>
&e1,
+Â Â Â Â Â const
matrix_expression<E2> &e2,
+Â Â Â Â Â dense_proxy_tag,
+Â Â Â Â Â unknown_orientation_tag)
{
+Â Â Â Â typedef typename
matrix_matrix_binary_traits<typename E1::value_type, E1,
+Â Â Â Â Â Â Â Â
       Â
       Â
    typename E2::value_type,
E2>::expression_type expression_type;
+Â Â Â typedef typename
E1::matrix_temporary_type result_type;
+Â Â Â typedef typename result_type::value_type
result_value;
+
+Â Â Â if (e1 ().size1() < B::limit || e2
().size2() < B::limit) {
+Â Â Â Â Â return expression_type (e1
(), e2 ());
+Â Â Â } else {
+Â Â Â Â Â Â result_type rv(e1
().size1(), e2 ().size2());
+ Â Â Â Â Â detail::gemm<E1, E2,
result_type, B>(result_value(1), e1, e2,
+Â Â Â Â Â Â
     result_value(0), rv);
+Â Â Â Â Â return rv;
+Â Â Â }
+Â Â }
+
    // Dispatcher
-Â Â template<class E1, class E2>
+Â Â template<class E1, class E2,
+Â Â Â Â Â Â Â typename B =
detail::prod_block_size<typename common_type<typename
E1::value_type,
+Â Â Â Â Â Â Â Â Â
        Â
       Â
   typename E2::value_type>::type>
>
    BOOST_UBLAS_INLINE
    typename
matrix_matrix_binary_traits<typename E1::value_type, E1,
       Â
       Â
     typename E2::value_type,
E2>::result_type
@@ -5498,7 +5547,7 @@ namespace boost { namespace numeric {
namespace ublas {
       Â
       Â
       Â
   typename E2::value_type,
E2>::storage_category storage_category;
      typedef typename
matrix_matrix_binary_traits<typename E1::value_type, E1,
       Â
       Â
       Â
   typename E2::value_type,
E2>::orientation_category orientation_category;
-Â Â Â Â return prod (e1, e2,
storage_category (), orientation_category ());
+Â Â Â Â return prod<E1, E2, B>
(e1, e2, storage_category (), orientation_category ());
    }
    template<class E1, class E2>
diff --git a/include/boost/numeric/ublas/operation.hpp
b/include/boost/numeric/ublas/operation.hpp
index 64657cc..80bfab6 100644
--- a/include/boost/numeric/ublas/operation.hpp
+++ b/include/boost/numeric/ublas/operation.hpp
@@ -14,7 +14,7 @@
#define _BOOST_UBLAS_OPERATION_
#include <boost/numeric/ublas/matrix_proxy.hpp>
-
+#include <boost/numeric/ublas/detail/gemm.hpp>
/** \file operation.hpp
 * \brief This file contains some specialized
products.
 */
@@ -637,13 +637,45 @@ namespace boost { namespace numeric {
namespace ublas {
      return m;
    }
-Â Â // Dispatcher
-Â Â template<class M, class E1, class E2,
class TRI>
+Â Â template<class M, class E1, class E2,
+Â Â Â Â Â Â Â typename B =
detail::prod_block_size<typename common_type<typename
E1::value_type,
+Â Â Â Â Â Â Â Â Â
        Â
       Â
   typename E2::value_type>::type>
>
+Â Â BOOST_UBLAS_INLINE
+Â Â M&
+Â Â axpy_prod (const matrix_expression<E1>
&e1,
+Â Â Â Â Â Â
   const matrix_expression<E2>
&e2,
+Â Â Â Â Â Â
   M &m, full,
+Â Â Â Â Â
   dense_proxy_tag, bool init = true)
+Â Â {
+Â Â Â Â typedef typename M::value_type
value_type;
+
+Â Â Â Â if (m.size1() < B::limit ||
m.size2() < B::limit) {
+Â Â Â Â Â Â typedef typename
M::storage_category storage_category;
+Â Â Â Â Â Â typedef typename
M::orientation_category orientation_category;
+
+Â Â Â Â Â Â if (init)
+Â Â Â Â Â Â Â Â
m.assign (zero_matrix<value_type> (e1 ().size1 (), e2
().size2 ()));
+Â Â Â Â Â Â return axpy_prod
(e1, e2, m, full (), storage_category (),
+Â Â Â Â Â Â Â Â Â
   orientation_category ());
+Â Â Â Â } else {
+
+
+Â Â Â Â Â Â
detail::gemm<E1, E2, M, B>(value_type(1), e1, e2,
+Â Â Â Â Â Â Â Â Â
    Â
   value_type(init? 0 : 1), m);
+Â Â Â Â Â Â return m;
+Â Â Â Â }
+Â Â }
+
+Â Â // Dispatchers
+Â Â template<class M, class E1, class E2,
class TRI, class S,
+Â Â Â Â Â Â Â typename B =
detail::prod_block_size<typename common_type<typename
E1::value_type,
+Â Â Â Â Â Â Â Â Â
        Â
       Â
   typename E2::value_type>::type>
>
    BOOST_UBLAS_INLINE
    M &
    axpy_prod (const
matrix_expression<E1> &e1,
       Â
const matrix_expression<E2> &e2,
-Â Â Â Â Â Â
   M &m, TRI, bool init = true) {
+Â Â Â Â Â Â
   M &m, TRI,
+Â Â Â Â Â Â
   S, bool init = true) {
      typedef typename
M::value_type value_type;
      typedef typename
M::storage_category storage_category;
      typedef typename
M::orientation_category orientation_category;
@@ -651,9 +683,31 @@ namespace boost { namespace numeric {
namespace ublas {
      if (init)
    Â
   m.assign (zero_matrix<value_type>
(e1 ().size1 (), e2 ().size2 ()));
-Â Â Â Â return axpy_prod (e1, e2, m,
triangular_restriction (), storage_category (),
orientation_category ());
+
+Â Â Â Â return axpy_prod (e1, e2, m,
triangular_restriction (), storage_category (),
+Â Â Â Â Â Â Â Â Â
 orientation_category ());
    }
-Â Â template<class M, class E1, class E2,
class TRI>
+
+
+Â Â template<class M, class E1, class E2,
class TRI,
+Â Â Â Â Â Â Â typename B =
detail::prod_block_size<typename common_type<typename
E1::value_type,
+Â Â Â Â Â Â Â Â Â
        Â
       Â
   typename E2::value_type>::type>
>
+Â Â BOOST_UBLAS_INLINE
+Â Â M &
+Â Â axpy_prod (const matrix_expression<E1>
&e1,
+Â Â Â Â Â Â
   const matrix_expression<E2>
&e2,
+Â Â Â Â Â Â
   M &m, TRI, bool init = true) {
+Â Â Â Â typedef typename M::value_type
value_type;
+Â Â Â Â typedef typename
M::storage_category storage_category;
+Â Â Â Â typedef typename
M::orientation_category orientation_category;
+Â Â Â Â typedef TRI
triangular_restriction;
+
+Â Â Â Â return axpy_prod<M, E1, E2,
TRI, B> (e1, e2, m, triangular_restriction (),
+Â Â Â Â Â Â Â Â Â
      Â
   storage_category (), init);
+Â Â }
+Â Â template<class M, class E1, class E2,
class TRI,
+Â Â Â Â Â Â Â typename B =
detail::prod_block_size<typename common_type<typename
E1::value_type,
+Â Â Â Â Â Â Â Â Â
        Â
       Â
   typename E2::value_type>::type>
>
    BOOST_UBLAS_INLINE
    M
    axpy_prod (const
matrix_expression<E1> &e1,
@@ -663,7 +717,8 @@ namespace boost { namespace numeric {
namespace ublas {
      typedef TRI
triangular_restriction;
      matrix_type m (e1
().size1 (), e2 ().size2 ());
-Â Â Â Â return axpy_prod (e1, e2, m,
triangular_restriction (), true);
+Â Â Â Â return axpy_prod<M, E1, E2,
TRI, B> (e1, e2, m, triangular_restriction (),
+Â Â Â Â Â Â Â Â Â
      Â
   true);
    }
   /** \brief computes <tt>M += A
X</tt> or <tt>M = A X</tt> in an
@@ -690,7 +745,9 @@ namespace boost { namespace numeric {
namespace ublas {
       \param E1 type
of a matrix expression \c A
       \param E2 type
of a matrix expression \c X
   */
-Â Â template<class M, class E1, class E2>
+Â Â template<class M, class E1, class E2,
+Â Â Â Â Â Â Â typename B =
detail::prod_block_size<typename common_type<typename
E1::value_type,
+Â Â Â Â Â Â Â Â Â
        Â
       Â
   typename E2::value_type>::type>
>
    BOOST_UBLAS_INLINE
    M &
    axpy_prod (const
matrix_expression<E1> &e1,
@@ -700,11 +757,15 @@ namespace boost { namespace numeric {
namespace ublas {
      typedef typename
M::storage_category storage_category;
      typedef typename
M::orientation_category orientation_category;
-Â Â Â Â if (init)
-Â Â Â Â Â Â m.assign
(zero_matrix<value_type> (e1 ().size1 (), e2 ().size2
()));
-Â Â Â Â return axpy_prod (e1, e2, m,
full (), storage_category (), orientation_category ());
+Â Â Â Â // if (init)
+Â Â Â Â //Â
   m.assign (zero_matrix<value_type>
(e1 ().size1 (), e2 ().size2 ()));
+Â Â Â Â return axpy_prod<M, E1, E2,
B> (e1, e2, m, full (), storage_category (),
+Â Â Â Â Â Â Â Â Â
      init);
    }
-Â Â template<class M, class E1, class E2>
+
+Â Â template<class M, class E1, class E2,
+Â Â Â Â Â Â Â typename B =
detail::prod_block_size<typename common_type<typename
E1::value_type,
+Â Â Â Â Â Â Â Â Â
        Â
       Â
   typename E2::value_type>::type>
>
    BOOST_UBLAS_INLINE
    M
    axpy_prod (const
matrix_expression<E1> &e1,
@@ -712,7 +773,7 @@ namespace boost { namespace numeric {
namespace ublas {
      typedef M
matrix_type;
      matrix_type m (e1
().size1 (), e2 ().size2 ());
-Â Â Â Â return axpy_prod (e1, e2, m,
full (), true);
+Â Â Â Â return axpy_prod<M, E1, E2,
B> (e1, e2, m, full (), true);
    }
--
1.9.1