|
Ublas : |
Subject: [ublas] [PATCH 1/3] ublas: improved dense matrix multiplication performance
From: Imre Palik (imre_palik_at_[hidden])
Date: 2016-02-29 02:46:36
This patch includes the gemm implementation from Michael Lehn to
boost::ublas.
This modifies the workings of ublas::prod() and ublas::axppy_prod()
to use gemm() above a certain matrix size.
This patch only contains the basic architecture, and a generic c++
implementation.
Signed-off-by: Imre Palik <imre_palik_at_[hidden]>
Cc: Michael Lehn <michael.lehn_at_[hidden]>
--- include/boost/numeric/ublas/detail/gemm.hpp | 331 ++++++++++++++++++++++ include/boost/numeric/ublas/matrix_expression.hpp | 139 ++++++++- include/boost/numeric/ublas/operation.hpp | 154 +++++++++- 3 files changed, 605 insertions(+), 19 deletions(-) create mode 100644 include/boost/numeric/ublas/detail/gemm.hpp diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp new file mode 100644 index 0000000..f66e80f --- /dev/null +++ b/include/boost/numeric/ublas/detail/gemm.hpp @@ -0,0 +1,331 @@ +// +// Copyright (c) 2016 +// Michael Lehn, Imre Palik +// +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef _BOOST_UBLAS_GEMM_ +#define _BOOST_UBLAS_GEMM_ + +#include <boost/type_traits/common_type.hpp> +#include <boost/align/aligned_alloc.hpp> +#include <boost/align/assume_aligned.hpp> +#include <boost/static_assert.hpp> +#include <boost/numeric/ublas/matrix_proxy.hpp> + +namespace boost { namespace numeric { namespace ublas { namespace detail { + + template <typename T> + struct prod_block_size { + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4092; + static const unsigned mr = 4; // stripe width for lhs + static const unsigned nr = 12; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 26; // Use gemm from this size + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template <> + struct prod_block_size<float> { + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4080; + static const unsigned mr = 4; // stripe width for lhs + static const unsigned nr = 24; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 26; // Use gemm from this size + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template <> + struct prod_block_size<long double> { + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4096; + static const unsigned mr = 2; // stripe width for lhs + static const unsigned nr = 2; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 26; // Use gemm from this size + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template<typename T> + struct is_blocksize { + struct fallback { static const int nr = 0; }; + struct derived : T, fallback {}; + template<int C1> + struct nonr { + static const bool value = false; + typedef false_type type; + }; + + template<typename C> static char (&f(typename nonr<C::nr>::type*))[1]; + template<typename C> static char (&f(...))[2]; + + static bool const value = sizeof(f<derived>(0)) == 2; + }; + + template <typename E> + void + gescal(const typename E::value_type &alpha, matrix_expression<E> &X) + { + typedef typename E::size_type size_type; + + for (size_type i=0; i<X().size1(); ++i) { + for (size_type j=0; j<X().size2(); ++j) { + X()(i,j) *= alpha; + } + } + } + + template <typename Index, typename T> + void + geaxpy(Index m, Index n, const T &alpha, + const T *X, Index incRowX, Index incColX, + T *Y, Index incRowY, Index incColY) + { + for (Index j=0; j<n; ++j) { + for (Index i=0; i<m; ++i) { + Y[i*incRowY+j*incColY] += alpha*X[i*incRowX+j*incColX]; + } + } + } + + template <typename Index, typename TX> + void + gescal(Index m, Index n, + const TX &alpha, + TX *X, Index incRowX, Index incColX) + { + for (Index j=0; j<n; ++j) { + for (Index i=0; i<m; ++i) { + X[i*incRowX+j*incColX] *= alpha; + } + } + } + + //-- Micro Kernel -------------------------------------------------------------- + template <typename Index, typename T, typename TC, typename BlockSize> + void + ugemm(Index kc, TC alpha, const T *A, const T *B, + TC beta, TC *C, Index incRowC, Index incColC) + { + BOOST_ALIGN_ASSUME_ALIGNED(A, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED(B, BlockSize::align); + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr; + typename boost::aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Pa; + T *P = reinterpret_cast<T*>(Pa.address()); + for (unsigned c = 0; c < MR * NR; c++) + P[c] = 0; + + for (Index l=0; l<kc; ++l) { + for (Index i=0; i<MR; ++i) { + for (Index j=0; j<NR; ++j) { + P[i* NR+j] += A[i]*B[j]; + } + } + A += MR; + B += NR; + } + + if (alpha!=TC(1)) { + for (Index i=0; i<MR; ++i) { + for (Index j=0; j<NR; ++j) { + P[i*NR+j] *= alpha; + } + } + } + + if (beta == TC(0)) { + for (Index i=0; i<MR; ++i) { + for (Index j=0; j<NR; ++j) { + C[i*incRowC+j*incColC] = P[i*NR+j]; + } + } + } else { + for (Index i=0; i<MR; ++i) { + for (Index j=0; j<NR; ++j) { + C[i*incRowC+j*incColC] *= beta; + C[i*incRowC+j*incColC] += P[i*NR+j]; + } + } + } + } + + //-- Macro Kernel -------------------------------------------------------------- + template <typename Index, typename T, typename TC, typename BlockSize> + void + mgemm(Index mc, Index nc, Index kc, TC alpha, + const T *A, const T *B, TC beta, + TC *C, Index incRowC, Index incColC) + { + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr; + const Index mp = (mc+MR-1) / MR; + const Index np = (nc+NR-1) / NR; + const Index mr_ = mc % MR; + const Index nr_ = nc % NR; + + // #if defined(_OPENMP) + // #pragma omp parallel for + // #endif + for (Index j=0; j<np; ++j) { + // __builtin_prefetch(B + j * kc * NR, 0); + // __builtin_prefetch(B + j * kc * NR + 8, 0); + // __builtin_prefetch(B + j * kc * NR + 16, 0); + const Index nr = (j!=np-1 || nr_==0) ? NR : nr_; + T C_[MR*NR]; + + for (Index i=0; i<mp; ++i) { + const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_; + + if (mr==MR && nr==NR) { + ugemm<Index, T, TC, BlockSize>(kc, alpha, + &A[i*kc*MR], &B[j*kc*NR], + beta, + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + } else { + std::fill_n(C_, MR*NR, T(0)); + ugemm<Index, T, TC, BlockSize>(kc, alpha, + &A[i*kc*MR], &B[j*kc*NR], + T(0), + C_, NR, Index(1)); + gescal(mr, nr, beta, + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + geaxpy(mr, nr, T(1), C_, NR, Index(1), + &C[i*MR*incRowC+j*NR*incColC], + incRowC, incColC); + } + } + } + } + + //-- Packing blocks ------------------------------------------------------------ + template <typename E, typename T, typename BlockSize> + void + pack_A(const matrix_expression<E> &A, T *p) + { + typedef typename E::size_type size_type; + + const size_type mc = A ().size1(); + const size_type kc = A ().size2(); + static const size_type MR = BlockSize::mr; + const size_type mp = (mc+MR-1) / MR; + + for (size_type j=0; j<kc; ++j) { + for (size_type l=0; l<mp; ++l) { + for (size_type i0=0; i0<MR; ++i0) { + size_type i = l*MR + i0; + size_type nu = l*MR*kc + j*MR + i0; + p[nu] = (i<mc) ? A()(i,j) : T(0); + } + } + } + } + + template <typename E, typename T, typename BlockSize> + void + pack_B(const matrix_expression<E> &B, T *p) + { + typedef typename E::size_type size_type; + + const size_type kc = B ().size1(); + const size_type nc = B ().size2(); + static const size_type NR = BlockSize::nr; + const size_type np = (nc+NR-1) / NR; + + for (size_type l=0; l<np; ++l) { + for (size_type j0=0; j0<NR; ++j0) { + for (size_type i=0; i<kc; ++i) { + size_type j = l*NR+j0; + size_type nu = l*NR*kc + i*NR + j0; + p[nu] = (j<nc) ? B()(i,j) : T(0); + } + } + } + } + + //-- Frame routine ------------------------------------------------------------- + template <typename E1, typename E2, typename E3, typename BlockSize> + void + gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1, + const matrix_expression<E2> &e2, + typename E3::value_type beta, matrix_expression<E3> &e3, + BlockSize) + { + typedef typename E3::size_type size_type; + typedef typename E1::value_type value_type1; + typedef typename E2::value_type value_type2; + typedef typename E3::value_type value_type3; + typedef typename common_type<value_type1, value_type2>::type value_type_i; + + static const size_type MC = BlockSize::mc; + static const size_type NC = BlockSize::nc; + + const size_type m = BOOST_UBLAS_SAME (e3 ().size1 (), e1 ().size1 ()); + const size_type n = BOOST_UBLAS_SAME (e3 ().size2 (), e2 ().size2 ()); + const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ()); + + static const size_type KC = BlockSize::kc; + const size_type mb = (m+MC-1) / MC; + const size_type nb = (n+NC-1) / NC; + const size_type kb = (k+KC-1) / KC; + const size_type mc_ = m % MC; + const size_type nc_ = n % NC; + const size_type kc_ = k % KC; + + value_type3 *C_ = &e3()(0,0); + const size_type incRowC = &e3()(1,0) - &e3()(0,0); + const size_type incColC = &e3()(0,1) - &e3()(0,0); + value_type_i *A = + static_cast<value_type_i *>(boost::alignment::aligned_alloc(BlockSize::align, + sizeof(value_type_i) * MC*KC)); + value_type_i *B = + static_cast<value_type_i *>(boost::alignment::aligned_alloc(BlockSize::align, + sizeof(value_type_i) * KC*NC)); + + if (alpha==value_type3(0) || k==0) { + gescal(beta, e3); + return; + } + + for (size_type j=0; j<nb; ++j) { + size_type nc = (j!=nb-1 || nc_==0) ? NC : nc_; + + for (size_type l=0; l<kb; ++l) { + size_type kc = (l!=kb-1 || kc_==0) ? KC : kc_; + value_type3 beta_ = (l==0) ? beta : value_type3(1); + + const matrix_range<const E2> Bs = subrange(e2(), l*KC, l*KC+kc, j*NC, j*NC+nc); + pack_B<matrix_range<const E2>, value_type_i, BlockSize>(Bs, B); + + for (size_type i=0; i<mb; ++i) { + size_type mc = (i!=mb-1 || mc_==0) ? MC : mc_; + + const matrix_range<const E1> As = subrange(e1(), i*MC, i*MC+mc, l*KC, l*KC+kc); + pack_A<matrix_range<const E1>, value_type_i, BlockSize>(As, A); + + mgemm<size_type, value_type_i, value_type3, BlockSize>(mc, nc, kc, alpha, A, B, beta_, + &C_[i*MC*incRowC+j*NC*incColC], + incRowC, incColC); + } + } + } + boost::alignment::aligned_free(A); + boost::alignment::aligned_free(B); + } +}}}} +#endif diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp index a363130..78daf2b 100644 --- a/include/boost/numeric/ublas/matrix_expression.hpp +++ b/include/boost/numeric/ublas/matrix_expression.hpp @@ -14,6 +14,7 @@ #define _BOOST_UBLAS_MATRIX_EXPRESSION_ #include <boost/numeric/ublas/vector_expression.hpp> +#include <boost/numeric/ublas/detail/gemm.hpp> // Expression templates based on ideas of Todd Veldhuizen and Geoffrey Furnish // Iterators based on ideas of Jeremy Siek @@ -5460,45 +5461,104 @@ namespace boost { namespace numeric { namespace ublas { expression2_closure_type e2_; }; + namespace detail { + template<class E1, class E2, class P, bool s> + struct binary_calculate_result_type; + + template<class E1, class E2, class P> + struct binary_calculate_result_type<E1, E2, P, false> { + typedef matrix_matrix_binary<E1, E2, matrix_matrix_prod<E1, E2, P> > result_type; + }; + + template<class E1, class E2, class P> + struct binary_calculate_result_type<E1, E2, P, true> { + typedef matrix<P> result_type; + }; + } + template<class T1, class E1, class T2, class E2> struct matrix_matrix_binary_traits { - typedef unknown_storage_tag storage_category; + // typedef unknown_storage_tag storage_category; + typedef typename storage_restrict_traits<typename E1::storage_category, typename E2::storage_category>::storage_category storage_category; typedef unknown_orientation_tag orientation_category; typedef typename promote_traits<T1, T2>::promote_type promote_type; typedef matrix_matrix_binary<E1, E2, matrix_matrix_prod<E1, E2, promote_type> > expression_type; #ifndef BOOST_UBLAS_SIMPLE_ET_DEBUG - typedef expression_type result_type; + // typedef expression_type result_type; + typedef typename detail::binary_calculate_result_type<E1, E2, promote_type, boost::is_base_of<dense_proxy_tag, storage_category>::value>::result_type result_type; #else typedef typename E1::matrix_temporary_type result_type; #endif }; - template<class E1, class E2> + template<class E1, class E2, class B> BOOST_UBLAS_INLINE typename matrix_matrix_binary_traits<typename E1::value_type, E1, typename E2::value_type, E2>::result_type prod (const matrix_expression<E1> &e1, const matrix_expression<E2> &e2, + B, unknown_storage_tag, unknown_orientation_tag) { + typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1, typename E2::value_type, E2>::expression_type expression_type; return expression_type (e1 (), e2 ()); } - // Dispatcher - template<class E1, class E2> + template<class E1, class E2, class B> BOOST_UBLAS_INLINE typename matrix_matrix_binary_traits<typename E1::value_type, E1, typename E2::value_type, E2>::result_type prod (const matrix_expression<E1> &e1, - const matrix_expression<E2> &e2) { + const matrix_expression<E2> &e2, + B b, + dense_proxy_tag, + unknown_orientation_tag) { + typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1, + typename E2::value_type, E2>::expression_type expression_type; + typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1, + typename E2::value_type, E2>::result_type result_type; + typedef typename result_type::value_type result_value; + + if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) { + return expression_type (e1 (), e2 ()); + } else { + result_type rv(e1 ().size1(), e2 ().size2()); + detail::gemm(result_value(1), e1, e2, + result_value(0), rv, b); + return rv; + } + } + + // Dispatcher + template<class E1, class E2, class B> + BOOST_UBLAS_INLINE + typename enable_if_c<detail::is_blocksize<B>::value, + typename matrix_matrix_binary_traits<typename E1::value_type, E1, + typename E2::value_type, E2>::result_type>::type + prod (const matrix_expression<E1> &e1, + const matrix_expression<E2> &e2, + B b) { BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0); typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1, typename E2::value_type, E2>::storage_category storage_category; typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1, typename E2::value_type, E2>::orientation_category orientation_category; - return prod (e1, e2, storage_category (), orientation_category ()); + return prod (e1, e2, b, storage_category (), orientation_category ()); + } + + template<class E1, class E2> + BOOST_UBLAS_INLINE + typename matrix_matrix_binary_traits<typename E1::value_type, E1, + typename E2::value_type, E2>::result_type + prod (const matrix_expression<E1> &e1, + const matrix_expression<E2> &e2) { + BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0); + typedef typename detail::prod_block_size<typename common_type<typename E1::value_type, + typename E2::value_type>::type> block_sizes; + + return prod (e1, e2, block_sizes()); } template<class E1, class E2> @@ -5529,13 +5589,74 @@ namespace boost { namespace numeric { namespace ublas { return prec_prod (e1, e2, storage_category (), orientation_category ()); } - template<class M, class E1, class E2> + template<class M, class E1, class E2, typename B> + BOOST_UBLAS_INLINE + void + prod (const matrix_expression<E1> &e1, + const matrix_expression<E2> &e2, + M &m, B b, + unknown_storage_tag, + unknown_orientation_tag) { + typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1, + typename E2::value_type, E2>::expression_type expression_type; + + m.assign(expression_type (e1 (), e2 ())); + } + + template<class M, class E1, class E2, typename B> + BOOST_UBLAS_INLINE + void + prod (const matrix_expression<E1> &e1, + const matrix_expression<E2> &e2, + M &m, B b, + dense_proxy_tag, + unknown_orientation_tag) { + typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1, + typename E2::value_type, E2>::expression_type expression_type; + typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1, + typename E2::value_type, E2>::result_type result_type; + typedef typename result_type::value_type result_value; + + if (e1 ().size1() < B::limit || e2 ().size2() < B::limit) { + m.assign(expression_type (e1 (), e2 ())); + } else { + detail::gemm(result_value(1), e1, e2, + result_value(0), m, b); + } + } + + // dispatcher + template<class M, class E1, class E2, class B> BOOST_UBLAS_INLINE M & prod (const matrix_expression<E1> &e1, const matrix_expression<E2> &e2, + M &m, B b) { + BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0); + typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1, + typename E2::value_type, E2>::storage_category storage_category; + typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1, + typename E2::value_type, E2>::orientation_category orientation_category; + + prod (e1, e2, m, b, storage_category(), orientation_category()); + return m; + } + template<class M, class E1, class E2> + BOOST_UBLAS_INLINE + typename enable_if_c<!detail::is_blocksize<M>::value, + M&>::type + prod (const matrix_expression<E1> &e1, + const matrix_expression<E2> &e2, M &m) { - return m.assign (prod (e1, e2)); + BOOST_STATIC_ASSERT (E1::complexity == 0 && E2::complexity == 0); + typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1, + typename E2::value_type, E2>::storage_category storage_category; + typedef typename matrix_matrix_binary_traits<typename E1::value_type, E1, + typename E2::value_type, E2>::orientation_category orientation_category; + typedef typename detail::prod_block_size<typename common_type<typename E1::value_type, + typename E2::value_type>::type> block_sizes; + prod (e1, e2, m, block_sizes(), storage_category(), orientation_category()); + return m; } template<class M, class E1, class E2> diff --git a/include/boost/numeric/ublas/operation.hpp b/include/boost/numeric/ublas/operation.hpp index 64657cc..393a9bd 100644 --- a/include/boost/numeric/ublas/operation.hpp +++ b/include/boost/numeric/ublas/operation.hpp @@ -14,7 +14,7 @@ #define _BOOST_UBLAS_OPERATION_ #include <boost/numeric/ublas/matrix_proxy.hpp> - +#include <boost/numeric/ublas/detail/gemm.hpp> /** \file operation.hpp * \brief This file contains some specialized products. */ @@ -637,7 +637,63 @@ namespace boost { namespace numeric { namespace ublas { return m; } - // Dispatcher + template<class M, class E1, class E2, class S, class B> + BOOST_UBLAS_INLINE + M& + axpy_prod (const matrix_expression<E1> &e1, + const matrix_expression<E2> &e2, + M &m, full, + S, bool init, B) + { + typedef typename M::value_type value_type; + typedef S storage_category; + typedef typename M::orientation_category orientation_category; + + + if (init) + m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ())); + return axpy_prod (e1, e2, m, full (), storage_category (), + orientation_category ()); + } + + template<class M, class E1, class E2, class B> + BOOST_UBLAS_INLINE + M& + axpy_prod (const matrix_expression<E1> &e1, + const matrix_expression<E2> &e2, + M &m, full, + dense_proxy_tag, bool init, B) + { + typedef typename M::value_type value_type; + typedef B block_sizes; + if (m.size1() < B::limit || m.size2() < B::limit) { + typedef typename M::storage_category storage_category; + typedef typename M::orientation_category orientation_category; + + if (init) + m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ())); + return axpy_prod (e1, e2, m, full (), storage_category (), + orientation_category ()); + } else { + detail::gemm(value_type(1), e1, e2, + value_type(init? 0 : 1), m, block_sizes()); + return m; + } + } + + template<class M, class E1, class E2, class B> + BOOST_UBLAS_INLINE + M& + axpy_prod (const matrix_expression<E1> &e1, + const matrix_expression<E2> &e2, + M &m, full, + dense_tag, bool init, B b) + { + return axpy_prod (e1, e2, m, full (), dense_proxy_tag(), + init, b); + } + + // Dispatchers template<class M, class E1, class E2, class TRI> BOOST_UBLAS_INLINE M & @@ -651,11 +707,16 @@ namespace boost { namespace numeric { namespace ublas { if (init) m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ())); - return axpy_prod (e1, e2, m, triangular_restriction (), storage_category (), orientation_category ()); + + return axpy_prod(e1, e2, m, triangular_restriction (), + storage_category (), orientation_category(), + init); } + template<class M, class E1, class E2, class TRI> BOOST_UBLAS_INLINE - M + typename enable_if_c<!detail::is_blocksize<TRI>::value, + M>::type axpy_prod (const matrix_expression<E1> &e1, const matrix_expression<E2> &e2, TRI) { @@ -690,29 +751,61 @@ namespace boost { namespace numeric { namespace ublas { \param E1 type of a matrix expression \c A \param E2 type of a matrix expression \c X */ + template<class M, class E1, class E2, class B> + BOOST_UBLAS_INLINE + M & + axpy_prod (const matrix_expression<E1> &e1, + const matrix_expression<E2> &e2, + M &m, bool init, B) { + typedef typename M::storage_category storage_category; + typedef B block_sizes; + + return axpy_prod(e1, e2, m, full (), storage_category (), + init, block_sizes()); + } + template<class M, class E1, class E2> BOOST_UBLAS_INLINE M & axpy_prod (const matrix_expression<E1> &e1, const matrix_expression<E2> &e2, M &m, bool init = true) { - typedef typename M::value_type value_type; typedef typename M::storage_category storage_category; - typedef typename M::orientation_category orientation_category; + typedef typename detail::prod_block_size<typename common_type<typename E1::value_type, + typename E2::value_type>::type> block_sizes; - if (init) - m.assign (zero_matrix<value_type> (e1 ().size1 (), e2 ().size2 ())); - return axpy_prod (e1, e2, m, full (), storage_category (), orientation_category ()); + return axpy_prod(e1, e2, m, full (), storage_category (), + init, block_sizes()); + } + + template<class M, class E1, class E2, class B> + BOOST_UBLAS_INLINE + typename boost::enable_if_c<detail::is_blocksize<B>::value, + M>::type + axpy_prod (const matrix_expression<E1> &e1, + const matrix_expression<E2> &e2, B) { + typedef M matrix_type; + typedef typename M::storage_category storage_category; + typedef B block_sizes; + + matrix_type m (e1 ().size1 (), e2 ().size2 ()); + return axpy_prod(e1, e2, m, full (), storage_category(), true, + block_sizes()); } + template<class M, class E1, class E2> BOOST_UBLAS_INLINE M axpy_prod (const matrix_expression<E1> &e1, const matrix_expression<E2> &e2) { typedef M matrix_type; + typedef typename M::storage_category storage_category; + typedef typename detail::prod_block_size<typename common_type<typename E1::value_type, + typename E2::value_type>::type> block_sizes; matrix_type m (e1 ().size1 (), e2 ().size2 ()); - return axpy_prod (e1, e2, m, full (), true); + return axpy_prod(e1, e2, m, full (), storage_category(), true, + block_sizes()); } @@ -825,6 +918,47 @@ namespace boost { namespace numeric { namespace ublas { return opb_prod (e1, e2, m, true); } + /** \brief computes <tt>C = alpha * A * B + beta * C</tt> in an + optimized fashion. + + \param alpha scalar multiplier + \param e1 the matrix expression \c A + \param e2 the matrix expression \c B + \param beta scalar multiplier + \param e3 the result matrix \c C + + <tt>gemm</tt> implements the well known gemm operation + + \ingroup blas3 + + \internal + + template parameters: + \param E1 type of a matrix expression \c A + \param E2 type of a matrix expression \c B + \param E3 type of a matrix expression \c C + */ + template <typename E1, typename E2, typename E3> + BOOST_UBLAS_INLINE + void + gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1, + const matrix_expression<E2> &e2, + typename E3::value_type beta, matrix_expression<E3> &e3) { + typedef typename detail::prod_block_size<typename common_type<typename E1::value_type, + typename E2::value_type>::type> block_sizes; + detail::gemm(alpha, e1, e2, beta, e3, block_sizes()); + } + + template <typename E1, typename E2, typename E3, typename BlockSize> + BOOST_UBLAS_INLINE + void + gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1, + const matrix_expression<E2> &e2, + typename E3::value_type beta, matrix_expression<E3> &e3, + BlockSize b) { + detail::gemm(alpha, e1, e2, beta, e3, b); + } + }}} #endif -- 1.9.1