Boost logo

Ublas :

Subject: [ublas] [PATCH 2/3] boost::ublas: gcc support for optimal matrix multiplication
From: Imre Palik (imre_palik_at_[hidden])
Date: 2016-02-29 02:46:37


This patch contains an optimised, vectorised kernel, using gcc's
SIMD vectors. This reaches matrix multiplication speeds
comparable to hand-crafted assembly kernels on x86 Haswell
microarchitecture.

The patch also contains optimised kerenl parameters for float,
double, and long double.

Signed-off-by: Imre Palik <imrep_at_[hidden]>
Cc: Michael Lehn <michael.lehn_at_[hidden]>

---
 include/boost/numeric/ublas/detail/block_sizes.hpp |  78 +++++++
 include/boost/numeric/ublas/detail/gemm.hpp        | 223 +++++++++++----------
 include/boost/numeric/ublas/detail/vector.hpp      |  27 +++
 include/boost/numeric/ublas/matrix_expression.hpp  |   1 +
 include/boost/numeric/ublas/operation.hpp          |   1 +
 5 files changed, 223 insertions(+), 107 deletions(-)
 create mode 100644 include/boost/numeric/ublas/detail/block_sizes.hpp
 create mode 100644 include/boost/numeric/ublas/detail/vector.hpp
diff --git a/include/boost/numeric/ublas/detail/block_sizes.hpp b/include/boost/numeric/ublas/detail/block_sizes.hpp
new file mode 100644
index 0000000..a146f10
--- /dev/null
+++ b/include/boost/numeric/ublas/detail/block_sizes.hpp
@@ -0,0 +1,78 @@
+//
+//  Copyright (c) 2016
+//  Michael Lehn, Imre Palik
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef _BOOST_UBLAS_BLOCK_SIZES_
+#define _BOOST_UBLAS_BLOCK_SIZES_
+
+#include <boost/numeric/ublas/detail/vector.hpp>
+
+namespace boost { namespace numeric { namespace ublas { namespace detail {
+
+    template <typename T>
+    struct prod_block_size {
+        static const unsigned vector_length =
+            _BOOST_UBLAS_VECTOR_SIZE > sizeof(T)? _BOOST_UBLAS_VECTOR_SIZE/sizeof(T) : 1; // Number of elements in a vector register
+        static const unsigned mc = 256;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = (4096/(3 * vector_length)) * (3 * vector_length);
+        static const unsigned mr = 4; // stripe width for lhs
+        static const unsigned nr = 3 * vector_length; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 27; // Use gemm from this size
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template <>
+    struct prod_block_size<float> {
+        static const unsigned mc = 256;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = 4096;
+        static const unsigned mr = 4; // stripe width for lhs
+        static const unsigned nr = 16; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 27; // Use gemm from this size
+        static const unsigned vector_length = _BOOST_UBLAS_VECTOR_SIZE/sizeof(float); // Number of elements in a vector register
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template <>
+    struct prod_block_size<long double> {
+        static const unsigned mc = 256;
+        static const unsigned kc = 512; // stripe length
+        static const unsigned nc = 4096;
+        static const unsigned mr = 1; // stripe width for lhs
+        static const unsigned nr = 4; // stripe width for rhs
+        static const unsigned align = 64; // align temporary arrays to this boundary
+        static const unsigned limit = 71; // Use gemm from this size
+        static const unsigned vector_length = 1; // Number of elements in a vector register
+        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
+        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
+        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
+    };
+
+    template<typename T>
+    struct is_blocksize {
+        struct fallback { static const int nr = 0; };
+        struct derived : T, fallback {};
+        template<int C1>
+        struct nonr {
+            static const bool value = false;
+            typedef false_type type;
+        };
+
+         template<typename C> static char (&f(typename nonr<C::nr>::type*))[1];
+         template<typename C> static char (&f(...))[2];
+
+         static bool const value = sizeof(f<derived>(0)) == 2;
+    };
+}}}}
+#endif
diff --git a/include/boost/numeric/ublas/detail/gemm.hpp b/include/boost/numeric/ublas/detail/gemm.hpp
index f66e80f..7fd0ab9 100644
--- a/include/boost/numeric/ublas/detail/gemm.hpp
+++ b/include/boost/numeric/ublas/detail/gemm.hpp
@@ -10,80 +10,26 @@
 #define _BOOST_UBLAS_GEMM_
 
 #include <boost/type_traits/common_type.hpp>
+#include <boost/type_traits/aligned_storage.hpp>
+#include <boost/type_traits/is_arithmetic.hpp>
 #include <boost/align/aligned_alloc.hpp>
 #include <boost/align/assume_aligned.hpp>
 #include <boost/static_assert.hpp>
 #include <boost/numeric/ublas/matrix_proxy.hpp>
+#include <boost/numeric/ublas/detail/vector.hpp>
+#include <boost/predef/compiler.h>
 
 namespace boost { namespace numeric { namespace ublas { namespace detail {
 
-    template <typename T>
-    struct prod_block_size {
-        static const unsigned mc = 256;
-        static const unsigned kc = 512; // stripe length
-        static const unsigned nc = 4092;
-        static const unsigned mr = 4; // stripe width for lhs
-        static const unsigned nr = 12; // stripe width for rhs
-        static const unsigned align = 64; // align temporary arrays to this boundary
-        static const unsigned limit = 26; // Use gemm from this size
-        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
-        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
-        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
-    };
-
-    template <>
-    struct prod_block_size<float> {
-        static const unsigned mc = 256;
-        static const unsigned kc = 512; // stripe length
-        static const unsigned nc = 4080;
-        static const unsigned mr = 4; // stripe width for lhs
-        static const unsigned nr = 24; // stripe width for rhs
-        static const unsigned align = 64; // align temporary arrays to this boundary
-        static const unsigned limit = 26; // Use gemm from this size
-        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
-        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
-        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
-    };
-
-    template <>
-    struct prod_block_size<long double> {
-        static const unsigned mc = 256;
-        static const unsigned kc = 512; // stripe length
-        static const unsigned nc = 4096;
-        static const unsigned mr = 2; // stripe width for lhs
-        static const unsigned nr = 2; // stripe width for rhs
-        static const unsigned align = 64; // align temporary arrays to this boundary
-        static const unsigned limit = 26; // Use gemm from this size
-        BOOST_STATIC_ASSERT_MSG(mc>0 && kc>0 && nc>0 && mr>0 && nr>0, "Invalid block size.");
-        BOOST_STATIC_ASSERT_MSG(mc % mr == 0, "MC must be a multiple of MR.");
-        BOOST_STATIC_ASSERT_MSG(nc % nr == 0, "NC must be a multiple of NR.");
-    };
-
-    template<typename T>
-    struct is_blocksize {
-        struct fallback { static const int nr = 0; };
-        struct derived : T, fallback {};
-        template<int C1>
-       struct nonr {
-           static const bool value = false;
-           typedef false_type type;
-       };
-
-         template<typename C> static char (&f(typename nonr<C::nr>::type*))[1];
-         template<typename C> static char (&f(...))[2];
-
-         static bool const value = sizeof(f<derived>(0)) == 2;
-    };
-
     template <typename E>
     void
     gescal(const typename E::value_type &alpha, matrix_expression<E> &X)
     {
         typedef typename E::size_type  size_type;
 
-       for (size_type i=0; i<X().size1(); ++i) {
+        for (size_type i=0; i<X().size1(); ++i) {
             for (size_type j=0; j<X().size2(); ++j) {
-               X()(i,j) *= alpha;
+                X()(i,j) *= alpha;
             }
         }
     }
@@ -114,9 +60,73 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         }
     }
 
-    //-- Micro Kernel --------------------------------------------------------------
-    template <typename Index, typename T, typename TC, typename BlockSize>
+    //-- Micro Kernel ----------------------------------------------------------
+#if defined(BOOST_COMP_GNUC_DETECTION) \
+    && BOOST_COMP_GNUC_DETECTION >= BOOST_VERSION_NUMBER(4,8,0)
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
+    typename enable_if_c<is_arithmetic<T>::value
+                         && (1 < BlockSize::vector_length),
+                         void>::type
+    ugemm(Index kc, TC alpha, const T *A, const T *B,
+          TC beta, TC *C, Index incRowC, Index incColC)
+    {
+        BOOST_ALIGN_ASSUME_ALIGNED (A, BlockSize::align);
+        BOOST_ALIGN_ASSUME_ALIGNED (B, BlockSize::align);
+        static const unsigned vector_length = BlockSize::vector_length;
+        static const Index MR = BlockSize::mr;
+        static const Index NR = BlockSize::nr/vector_length;
+
+        typedef T vx __attribute__((vector_size (_BOOST_UBLAS_VECTOR_SIZE)));
+
+        vx P[MR*NR] = {};
+
+        const vx *b = (const vx *)B;
+        for (Index l=0; l<kc; ++l) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    P[i*NR+j] += A[i]*b[j];
+                }
+            }
+            A += MR;
+            b += NR;
+        }
+
+        if (alpha!=TC(1)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    P[i*NR+j] *= alpha;
+                }
+            }
+        }
+
+        const T *p = (const T *) P;
+        if (beta == TC(0)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR * vector_length; ++j) {
+                    C[i*incRowC+j*incColC] = p[i*NR * vector_length +j];
+                }
+            }
+        } else {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR * vector_length; ++j) {
+                    C[i*incRowC+j*incColC] *= beta;
+                    C[i*incRowC+j*incColC] += p[i*NR * vector_length+j];
+                }
+            }
+        }
+    }
+
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
+    typename enable_if_c<!is_arithmetic<T>::value
+                         || (BlockSize::vector_length <= 1),
+                         void>::type
+#else
+    template <typename Index, typename T, typename TC,
+              typename BlockSize>
     void
+#endif
     ugemm(Index kc, TC alpha, const T *A, const T *B,
           TC beta, TC *C, Index incRowC, Index incColC)
     {
@@ -124,47 +134,47 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         BOOST_ALIGN_ASSUME_ALIGNED(B, BlockSize::align);
         static const Index MR = BlockSize::mr;
         static const Index NR = BlockSize::nr;
-       typename boost::aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Pa;
-       T *P = reinterpret_cast<T*>(Pa.address());
-       for (unsigned c = 0; c < MR * NR; c++)
-         P[c] = 0;
+        typename aligned_storage<sizeof(T[MR*NR]),BlockSize::align>::type Pa;
+        T *P = reinterpret_cast<T*>(Pa.address());
+        for (unsigned c = 0; c < MR * NR; c++)
+          P[c] = 0;
 
         for (Index l=0; l<kc; ++l) {
-           for (Index i=0; i<MR; ++i) {
+            for (Index i=0; i<MR; ++i) {
               for (Index j=0; j<NR; ++j) {
                     P[i* NR+j] += A[i]*B[j];
                 }
             }
-           A += MR;
-           B += NR;
+            A += MR;
+            B += NR;
+        }
+
+        if (alpha!=TC(1)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
+                    P[i*NR+j] *= alpha;
+                }
+            }
         }
 
-       if (alpha!=TC(1)) {
-           for (Index i=0; i<MR; ++i) {
-               for (Index j=0; j<NR; ++j) {
-                   P[i*NR+j] *= alpha;
-               }
-           }
-       }
-
-       if (beta == TC(0)) {
-           for (Index i=0; i<MR; ++i) {
-               for (Index j=0; j<NR; ++j) {
+        if (beta == TC(0)) {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
                 C[i*incRowC+j*incColC] = P[i*NR+j];
-               }
-           }
-       } else {
-           for (Index i=0; i<MR; ++i) {
-               for (Index j=0; j<NR; ++j) {
+                }
+            }
+        } else {
+            for (Index i=0; i<MR; ++i) {
+                for (Index j=0; j<NR; ++j) {
                     C[i*incRowC+j*incColC] *= beta;
                     C[i*incRowC+j*incColC] += P[i*NR+j];
-               }
-           }
-       }
+                }
+            }
+        }
     }
 
-    //-- Macro Kernel --------------------------------------------------------------
-       template <typename Index, typename T, typename TC, typename BlockSize>
+    //-- Macro Kernel ----------------------------------------------------------
+        template <typename Index, typename T, typename TC, typename BlockSize>
     void
     mgemm(Index mc, Index nc, Index kc, TC alpha,
           const T *A, const T *B, TC beta,
@@ -181,9 +191,6 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         // #pragma omp parallel for
         // #endif
         for (Index j=0; j<np; ++j) {
-      // __builtin_prefetch(B + j * kc * NR, 0);
-      // __builtin_prefetch(B + j * kc * NR + 8, 0);
-      // __builtin_prefetch(B + j * kc * NR + 16, 0);
             const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
             T C_[MR*NR];
 
@@ -191,7 +198,7 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
                 const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
 
                 if (mr==MR && nr==NR) {
-                 ugemm<Index, T, TC, BlockSize>(kc, alpha,
+                  ugemm<Index, T, TC, BlockSize>(kc, alpha,
                           &A[i*kc*MR], &B[j*kc*NR],
                           beta,
                           &C[i*MR*incRowC+j*NR*incColC],
@@ -214,11 +221,12 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
     }
 
     //-- Packing blocks ------------------------------------------------------------
-       template <typename E, typename T, typename BlockSize>
+        template <typename E, typename T, typename BlockSize>
     void
     pack_A(const matrix_expression<E> &A, T *p)
     {
         typedef typename E::size_type  size_type;
+        BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align);
 
         const size_type mc = A ().size1();
         const size_type kc = A ().size2();
@@ -236,11 +244,12 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         }
     }
 
-       template <typename E, typename T, typename BlockSize>
+        template <typename E, typename T, typename BlockSize>
     void
     pack_B(const matrix_expression<E> &B, T *p)
     {
         typedef typename E::size_type  size_type;
+        BOOST_ALIGN_ASSUME_ALIGNED(p, BlockSize::align);
 
         const size_type kc = B ().size1();
         const size_type nc = B ().size2();
@@ -262,9 +271,9 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
     template <typename E1, typename E2, typename E3, typename BlockSize>
     void
     gemm(typename E3::value_type alpha, const matrix_expression<E1> &e1,
-        const matrix_expression<E2> &e2,
+         const matrix_expression<E2> &e2,
          typename E3::value_type beta, matrix_expression<E3> &e3,
-        BlockSize)
+         BlockSize)
     {
         typedef typename E3::size_type  size_type;
         typedef typename E1::value_type value_type1;
@@ -276,7 +285,7 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         static const size_type NC = BlockSize::nc;
 
         const size_type m = BOOST_UBLAS_SAME (e3 ().size1 (), e1 ().size1 ());
-       const size_type n = BOOST_UBLAS_SAME (e3 ().size2 (), e2 ().size2 ());
+        const size_type n = BOOST_UBLAS_SAME (e3 ().size2 (), e2 ().size2 ());
         const size_type k = BOOST_UBLAS_SAME (e1 ().size2 (), e2 ().size1 ());
 
         static const size_type KC = BlockSize::kc;
@@ -291,11 +300,11 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
         const size_type incRowC = &e3()(1,0) - &e3()(0,0);
         const size_type incColC = &e3()(0,1) - &e3()(0,0);
         value_type_i *A =
-            static_cast<value_type_i *>(boost::alignment::aligned_alloc(BlockSize::align,
-                                                                       sizeof(value_type_i) * MC*KC));
+            static_cast<value_type_i *>(::boost::alignment::aligned_alloc(BlockSize::align,
+                                                                        sizeof(value_type_i) * MC*KC));
         value_type_i *B =
-            static_cast<value_type_i *>(boost::alignment::aligned_alloc(BlockSize::align,
-                                                                     sizeof(value_type_i) * KC*NC));
+            static_cast<value_type_i *>(::boost::alignment::aligned_alloc(BlockSize::align,
+                                                                      sizeof(value_type_i) * KC*NC));
 
         if (alpha==value_type3(0) || k==0) {
             gescal(beta, e3);
@@ -319,13 +328,13 @@ namespace boost { namespace numeric { namespace ublas { namespace detail {
                     pack_A<matrix_range<const E1>, value_type_i, BlockSize>(As, A);
 
                     mgemm<size_type, value_type_i, value_type3, BlockSize>(mc, nc, kc, alpha, A, B, beta_,
-                         &C_[i*MC*incRowC+j*NC*incColC],
-                         incRowC, incColC);
+                          &C_[i*MC*incRowC+j*NC*incColC],
+                          incRowC, incColC);
                 }
             }
         }
-       boost::alignment::aligned_free(A);
-       boost::alignment::aligned_free(B);
+        ::boost::alignment::aligned_free(A);
+        ::boost::alignment::aligned_free(B);
     }
 }}}}
 #endif
diff --git a/include/boost/numeric/ublas/detail/vector.hpp b/include/boost/numeric/ublas/detail/vector.hpp
new file mode 100644
index 0000000..1681eda
--- /dev/null
+++ b/include/boost/numeric/ublas/detail/vector.hpp
@@ -0,0 +1,27 @@
+//
+//  Copyright (c) 2016
+//  Imre Palik
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef _BOOST_UBLAS_DETAIL_VECTOR_
+#define _BOOST_UBLAS_DETAIL_VECTOR_
+#include <boost/predef/hardware/simd.h>
+
+#if defined(BOOST_HW_SIMD) && (BOOST_HW_SIMD == BOOST_HW_SIMD_X86 || BOOST_HW_SIMD == BOOST_HW_SIMD_X86_AMD)
+#if BOOST_HW_SIMD >= BOOST_HW_SIMD_X86_AVX_VERSION
+#define _BOOST_UBLAS_VECTOR_SIZE 32
+#elif BOOST_HW_SIMD >= BOOST_HW_SIMD_X86_SSE_VERSION
+#define _BOOST_UBLAS_VECTOR_SIZE 16
+#else
+#define _BOOST_UBLAS_VECTOR_SIZE 8
+#endif
+#endif
+
+#ifndef _BOOST_UBLAS_VECTOR_SIZE
+#define _BOOST_UBLAS_VECTOR_SIZE 1
+#endif
+
+#endif
diff --git a/include/boost/numeric/ublas/matrix_expression.hpp b/include/boost/numeric/ublas/matrix_expression.hpp
index 78daf2b..c4e0ac8 100644
--- a/include/boost/numeric/ublas/matrix_expression.hpp
+++ b/include/boost/numeric/ublas/matrix_expression.hpp
@@ -15,6 +15,7 @@
 
 #include <boost/numeric/ublas/vector_expression.hpp>
 #include <boost/numeric/ublas/detail/gemm.hpp>
+#include <boost/numeric/ublas/detail/block_sizes.hpp>
 
 // Expression templates based on ideas of Todd Veldhuizen and Geoffrey Furnish
 // Iterators based on ideas of Jeremy Siek
diff --git a/include/boost/numeric/ublas/operation.hpp b/include/boost/numeric/ublas/operation.hpp
index 393a9bd..8fd3b63 100644
--- a/include/boost/numeric/ublas/operation.hpp
+++ b/include/boost/numeric/ublas/operation.hpp
@@ -15,6 +15,7 @@
 
 #include <boost/numeric/ublas/matrix_proxy.hpp>
 #include <boost/numeric/ublas/detail/gemm.hpp>
+#include <boost/numeric/ublas/detail/block_sizes.hpp>
 /** \file operation.hpp
  *  \brief This file contains some specialized products.
  */
-- 
1.9.1