|
Ublas : |
Subject: [ublas] [PATCH 2/3] boost::ublas: gcc support for optimal matrix multiplication
From: Imre Palik (imre_palik_at_[hidden])
Date: 2016-02-29 02:46:37
This patch contains an optimised, vectorised kernel, using gcc's
SIMD vectors. This reaches matrix multiplication speeds
comparable to hand-crafted assembly kernels on x86 Haswell
microarchitecture.
The patch also contains optimised kerenl parameters for float,
double, and long double.
Signed-off-by: Imre Palik <imrep_at_[hidden]>
Cc: Michael Lehn <michael.lehn_at_[hidden]>
--- include/boost/numeric/ublas/detail/block_sizes.hpp | 78 +++++++ include/boost/numeric/ublas/detail/gemm.hpp | 223 +++++++++++---------- include/boost/numeric/ublas/detail/vector.hpp | 27 +++ include/boost/numeric/ublas/matrix_expression.hpp | 1 + include/boost/numeric/ublas/operation.hpp | 1 + 5 files changed, 223 insertions(+), 107 deletions(-) create mode 100644 include/boost/numeric/ublas/detail/block_sizes.hpp create mode 100644 include/boost/numeric/ublas/detail/vector.hpp diff --git a/include/boost/numeric/ublas/detail/block_sizes.hpp b/include/boost/numeric/ublas/detail/block_sizes.hpp new file mode 100644 index 0000000..a146f10 --- /dev/null +++ b/include/boost/numeric/ublas/detail/block_sizes.hpp @@ -0,0 +1,78 @@ +// +// Copyright (c) 2016 +// Michael Lehn, Imre Palik +// +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef _BOOST_UBLAS_BLOCK_SIZES_ +#define _BOOST_UBLAS_BLOCK_SIZES_ + +#include <boost/numeric/ublas/detail/vector.hpp> + +namespace boost { namespace numeric { namespace ublas { namespace detail { + + template <typename T> + struct prod_block_size { + static const unsigned vector_length = + _BOOST_UBLAS_VECTOR_SIZE > sizeof(T)? _BOOST_UBLAS_VECTOR_SIZE/sizeof(T) : 1; // Number of elements in a vector register + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = (4096/(3 * vector_length)) * (3 * vector_length); + static const unsigned mr = 4; // stripe width for lhs + static const unsigned nr = 3 * vector_length; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 27; // Use gemm from this size + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template <> + struct prod_block_size<float> { + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4096; + static const unsigned mr = 4; // stripe width for lhs + static const unsigned nr = 16; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 27; // Use gemm from this size + static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(float); // Number of elements in a vector register + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template <> + struct prod_block_size<long double> { + static const unsigned mc = 256; + static const unsigned kc = 512; // stripe length + static const unsigned nc = 4096; + static const unsigned mr = 1; // stripe width for lhs + static const unsigned nr = 4; // stripe width for rhs + static const unsigned align = 64; // align temporary arrays to this boundary + static const unsigned limit = 71; // Use gemm from this size + static const unsigned vector_length = 1; // Number of elements in a vector register + BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); + BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); + BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); + }; + + template<typename T> + struct is_blocksize { + struct fallback { static const int nr = 0; }; + struct derived : T, fallback {}; + template<int C1> + struct nonr { + static const bool value = false; + typedef false_type type; + }; + + template<typename C> static char (&f(typename nonr<C::nr>::type*))[1]; + template<typename C> static char (&f(...))[2]; + + static bool const value = sizeof(f<derived>(0)) == 2; + }; +}}}} +#endif diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp index f66e80f..7fd0ab9 100644 --- a/include/boost/numeric/ublas/detail/gemm.hpp +++ b/include/boost/numeric/ublas/detail/gemm.hpp @@ -10,80 +10,26 @@ #define _BOOST_UBLAS_GEMM_ #include <boost/type_traits/common_type.hpp> +#include <boost/type_traits/aligned_storage.hpp> +#include <boost/type_traits/is_arithmetic.hpp> #include <boost/align/aligned_alloc.hpp> #include <boost/align/assume_aligned.hpp> #include <boost/static_assert.hpp> #include <boost/numeric/ublas/matrix_proxy.hpp> +#include <boost/numeric/ublas/detail/vector.hpp> +#include <boost/predef/compiler.h> namespace boost { namespace numeric { namespace ublas { namespace detail { - template <typename T> - struct prod_block_size { - static const unsigned mc = 256; - static const unsigned kc = 512; // stripe length - static const unsigned nc = 4092; - static const unsigned mr = 4; // stripe width for lhs - static const unsigned nr = 12; // stripe width for rhs - static const unsigned align = 64; // align temporary arrays to this boundary - static const unsigned limit = 26; // Use gemm from this size - BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); - BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); - BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); - }; - - template <> - struct prod_block_size<float> { - static const unsigned mc = 256; - static const unsigned kc = 512; // stripe length - static const unsigned nc = 4080; - static const unsigned mr = 4; // stripe width for lhs - static const unsigned nr = 24; // stripe width for rhs - static const unsigned align = 64; // align temporary arrays to this boundary - static const unsigned limit = 26; // Use gemm from this size - BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); - BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); - BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); - }; - - template <> - struct prod_block_size<long double> { - static const unsigned mc = 256; - static const unsigned kc = 512; // stripe length - static const unsigned nc = 4096; - static const unsigned mr = 2; // stripe width for lhs - static const unsigned nr = 2; // stripe width for rhs - static const unsigned align = 64; // align temporary arrays to this boundary - static const unsigned limit = 26; // Use gemm from this size - BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size."); - BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR."); - BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR."); - }; - - template<typename T> - struct is_blocksize { - struct fallback { static const int nr = 0; }; - struct derived : T, fallback {}; - template<int C1> - struct nonr { - static const bool value = false; - typedef false_type type; - }; - - template<typename C> static char (&f(typename nonr<C::nr>::type*))[1]; - template<typename C> static char (&f(...))[2]; - - static bool const value = sizeof(f<derived>(0)) == 2; - }; - template <typename E> void gescal(const typename E::value_type &alpha, matrix_expression<E> &X) { typedef typename E::size_type size_type; - for (size_type i=0; i<X().size1(); ++i) { + for (size_type i=0; i<X().size1(); ++i) { for (size_type j=0; j<X().size2(); ++j) { - X()(i,j) *= alpha; + X()(i,j) *= alpha; } } } @@ -114,9 +60,73 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { } } - //-- Micro Kernel -------------------------------------------------------------- - template <typename Index, typename T, typename TC, typename BlockSize> + //-- Micro Kernel ---------------------------------------------------------- +#if defined(BOOST_COMP_GNUC_DETECTION) \ + && BOOST_COMP_GNUC_DETECTION >= BOOST_VERSION_NUMBER(4,8,0) + template <typename Index, typename T, typename TC, + typename BlockSize> + typename enable_if_c<is_arithmetic<T>::value + && (1 < BlockSize::vector_length), + void>::type + ugemm(Index kc, TC alpha, const T *A, const T *B, + TC beta, TC *C, Index incRowC, Index incColC) + { + BOOST_ALIGN_ASSUME_ALIGNED (A, BlockSize::align); + BOOST_ALIGN_ASSUME_ALIGNED (B, BlockSize::align); + static const unsigned vector_length = BlockSize::vector_length; + static const Index MR = BlockSize::mr; + static const Index NR = BlockSize::nr/vector_length; + + typedef T vx __attribute__((vector_size (_BOOST_UBLAS_VECTOR_SIZE))); + + vx P[MR*NR] = {}; + + const vx *b = (const vx *)B; + for (Index l=0; l<kc; ++l) { + for (Index i=0; i<MR; ++i) { + for (Index j=0; j<NR; ++j) { + P[i*NR+j] += A[i]*b[j]; + } + } + A += MR; + b += NR; + } + + if (alpha!=TC(1)) { + for (Index i=0; i<MR; ++i) { + for (Index j=0; j<NR; ++j) { + P[i*NR+j] *= alpha; + } + } + } + + const T *p = (const T *) P; + if (beta == TC(0)) { + for (Index i=0; i<MR; ++i) { + for (Index j=0; j<NR * vector_length; ++j) { + C[i*incRowC+j*incColC] = p[i*NR * vector_length +j]; + } + } + } else { + for (Index i=0; i<MR; ++i) { + for (Index j=0; j<NR * vector_length; ++j) { + C[i*incRowC+j*incColC] *= beta; + C[i*incRowC+j*incColC] += p[i*NR * vector_length+j]; + } + } + } + } + + template <typename Index, typename T, typename TC, + typename BlockSize> + typename enable_if_c<!is_arithmetic<T>::value + || (BlockSize::vector_length <= 1), + void>::type +#else + template <typename Index, typename T, typename TC, + typename BlockSize> void +#endif ugemm(Index kc, TC alpha, const T *A, const T *B, TC beta, TC *C, Index incRowC, Index incColC) { @@ -124,47 +134,47 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { BOOST_ALIGN_ASSUME_ALIGNED(B, BlockSize::align); static const Index MR = BlockSize::mr; static const Index NR = BlockSize::nr; - typename boost::aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Pa; - T *P = reinterpret_cast<T*>(Pa.address()); - for (unsigned c = 0; c < MR * NR; c++) - P[c] = 0; + typename aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Pa; + T *P = reinterpret_cast<T*>(Pa.address()); + for (unsigned c = 0; c < MR * NR; c++) + P[c] = 0; for (Index l=0; l<kc; ++l) { - for (Index i=0; i<MR; ++i) { + for (Index i=0; i<MR; ++i) { for (Index j=0; j<NR; ++j) { P[i* NR+j] += A[i]*B[j]; } } - A += MR; - B += NR; + A += MR; + B += NR; + } + + if (alpha!=TC(1)) { + for (Index i=0; i<MR; ++i) { + for (Index j=0; j<NR; ++j) { + P[i*NR+j] *= alpha; + } + } } - if (alpha!=TC(1)) { - for (Index i=0; i<MR; ++i) { - for (Index j=0; j<NR; ++j) { - P[i*NR+j] *= alpha; - } - } - } - - if (beta == TC(0)) { - for (Index i=0; i<MR; ++i) { - for (Index j=0; j<NR; ++j) { + if (beta == TC(0)) { + for (Index i=0; i<MR; ++i) { + for (Index j=0; j<NR; ++j) { C[i*incRowC+j*incColC] = P[i*NR+j]; - } - } - } else { - for (Index i=0; i<MR; ++i) { - for (Index j=0; j<NR; ++j) { + } + } + } else { + for (Index i=0; i<MR; ++i) { + for (Index j=0; j<NR; ++j) { C[i*incRowC+j*incColC] *= beta; C[i*incRowC+j*incColC] += P[i*NR+j]; - } - } - } + } + } + } } - //-- Macro Kernel -------------------------------------------------------------- - template <typename Index, typename T, typename TC, typename BlockSize> + //-- Macro Kernel ---------------------------------------------------------- + template <typename Index, typename T, typename TC, typename BlockSize> void mgemm(Index mc, Index nc, Index kc, TC alpha, const T *A, const T *B, TC beta, @@ -181,9 +191,6 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { // #pragma omp parallel for // #endif for (Index j=0; j<np; ++j) { - // __builtin_prefetch(B + j * kc * NR, 0); - // __builtin_prefetch(B + j * kc * NR + 8, 0); - // __builtin_prefetch(B + j * kc * NR + 16, 0); const Index nr = (j!=np-1 || nr_==0) ? NR : nr_; T C_[MR*NR]; @@ -191,7 +198,7 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_; if (mr==MR && nr==NR) { - ugemm<Index, T, TC, BlockSize>(kc, alpha, + ugemm<Index, T, TC, BlockSize>(kc, alpha, &A[i*kc*MR], &B[j*kc*NR], beta, &C[i*MR*incRowC+j*NR*incColC], @@ -214,11 +221,12 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { } //-- Packing blocks ------------------------------------------------------------ - template <typename E, typename T, typename BlockSize> + template <typename E, typename T, typename BlockSize> void pack_A(const matrix_expression<E> &A, T *p) { typedef typename E::size_type size_type; + BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align); const size_type mc = A ().size1(); const size_type kc = A ().size2(); @@ -236,11 +244,12 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { } } - template <typename E, typename T, typename BlockSize> + template <typename E, typename T, typename BlockSize> void pack_B(const matrix_expression<E> &B, T *p) { typedef typename E::size_type size_type; + BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align); const size_type kc = B ().size1(); const size_type nc = B ().size2(); @@ -262,9 +271,9 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { template <typename E1, typename E2, typename E3, typename BlockSize> void gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1, - const matrix_expression<E2> &e2, + const matrix_expression<E2> &e2, typename E3::value_type beta, matrix_expression<E3> &e3, - BlockSize) + BlockSize) { typedef typename E3::size_type size_type; typedef typename E1::value_type value_type1; @@ -276,7 +285,7 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { static const size_type NC = BlockSize::nc; const size_type m = BOOST_UBLAS_SAME (e3 ().size1 (), e1 ().size1 ()); - const size_type n = BOOST_UBLAS_SAME (e3 ().size2 (), e2 ().size2 ()); + const size_type n = BOOST_UBLAS_SAME (e3 ().size2 (), e2 ().size2 ()); const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ()); static const size_type KC = BlockSize::kc; @@ -291,11 +300,11 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { const size_type incRowC = &e3()(1,0) - &e3()(0,0); const size_type incColC = &e3()(0,1) - &e3()(0,0); value_type_i *A = - static_cast<value_type_i *>(boost::alignment::aligned_alloc(BlockSize::align, - sizeof(value_type_i) * MC*KC)); + static_cast<value_type_i *>(::boost::alignment::aligned_alloc(BlockSize::align, + sizeof(value_type_i) * MC*KC)); value_type_i *B = - static_cast<value_type_i *>(boost::alignment::aligned_alloc(BlockSize::align, - sizeof(value_type_i) * KC*NC)); + static_cast<value_type_i *>(::boost::alignment::aligned_alloc(BlockSize::align, + sizeof(value_type_i) * KC*NC)); if (alpha==value_type3(0) || k==0) { gescal(beta, e3); @@ -319,13 +328,13 @@ namespace boost { namespace numeric { namespace ublas { namespace detail { pack_A<matrix_range<const E1>, value_type_i, BlockSize>(As, A); mgemm<size_type, value_type_i, value_type3, BlockSize>(mc, nc, kc, alpha, A, B, beta_, - &C_[i*MC*incRowC+j*NC*incColC], - incRowC, incColC); + &C_[i*MC*incRowC+j*NC*incColC], + incRowC, incColC); } } } - boost::alignment::aligned_free(A); - boost::alignment::aligned_free(B); + ::boost::alignment::aligned_free(A); + ::boost::alignment::aligned_free(B); } }}}} #endif diff --git a/include/boost/numeric/ublas/detail/vector.hpp b/include/boost/numeric/ublas/detail/vector.hpp new file mode 100644 index 0000000..1681eda --- /dev/null +++ b/include/boost/numeric/ublas/detail/vector.hpp @@ -0,0 +1,27 @@ +// +// Copyright (c) 2016 +// Imre Palik +// +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#ifndef _BOOST_UBLAS_DETAIL_VECTOR_ +#define _BOOST_UBLAS_DETAIL_VECTOR_ +#include <boost/predef/hardware/simd.h> + +#if defined(BOOST_HW_SIMD) && (BOOST_HW_SIMD == BOOST_HW_SIMD_X86 || BOOST_HW_SIMD == BOOST_HW_SIMD_X86_AMD) +#if BOOST_HW_SIMD >= BOOST_HW_SIMD_X86_AVX_VERSION +#define _BOOST_UBLAS_VECTOR_SIZE 32 +#elif BOOST_HW_SIMD >= BOOST_HW_SIMD_X86_SSE_VERSION +#define _BOOST_UBLAS_VECTOR_SIZE 16 +#else +#define _BOOST_UBLAS_VECTOR_SIZE 8 +#endif +#endif + +#ifndef _BOOST_UBLAS_VECTOR_SIZE +#define _BOOST_UBLAS_VECTOR_SIZE 1 +#endif + +#endif diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp index 78daf2b..c4e0ac8 100644 --- a/include/boost/numeric/ublas/matrix_expression.hpp +++ b/include/boost/numeric/ublas/matrix_expression.hpp @@ -15,6 +15,7 @@ #include <boost/numeric/ublas/vector_expression.hpp> #include <boost/numeric/ublas/detail/gemm.hpp> +#include <boost/numeric/ublas/detail/block_sizes.hpp> // Expression templates based on ideas of Todd Veldhuizen and Geoffrey Furnish // Iterators based on ideas of Jeremy Siek diff --git a/include/boost/numeric/ublas/operation.hpp b/include/boost/numeric/ublas/operation.hpp index 393a9bd..8fd3b63 100644 --- a/include/boost/numeric/ublas/operation.hpp +++ b/include/boost/numeric/ublas/operation.hpp @@ -15,6 +15,7 @@ #include <boost/numeric/ublas/matrix_proxy.hpp> #include <boost/numeric/ublas/detail/gemm.hpp> +#include <boost/numeric/ublas/detail/block_sizes.hpp> /** \file operation.hpp * \brief This file contains some specialized products. */ -- 1.9.1