Boost logo

Ublas :

Subject: Re: [ublas] Matrix multiplication performance
From: palik imre (imre_palik_at_[hidden])
Date: 2016-02-03 14:53:45


Hi Michael,

first of all, the link on your page to test_ublas.tgz is still pointing to session4.tgz

I also realised that we are starting to become memory bound.
I am getting approximately 10% improvement with the following setup on my old AMD box:

template <typename Index, typename T>
typename std::enable_if<std::is_floating_point<T>::value,
void>::type
ugemm(Index kc, T alpha,
               const T *A, const T *B,
     T beta,
     T *C, Index incRowC, Index incColC)
{
   A = (const T*) __builtin_assume_aligned (A, 128);
   B = (const T*) __builtin_assume_aligned (B, 128);
   
   static const unsigned div = 32/sizeof(T);
   static const Index MR = BlockSize<T>::MR;
   static const Index NR = BlockSize<T>::NR/div;

   typedef T vx __attribute__((vector_size (32)));

   vx P[MR*NR] __attribute__ ((aligned (128))) = {};
   const vx *B_ = (const vx *)B;
   for (Index l=0; l<kc; ++l) {
     if (!(kc&3))
           __builtin_prefetch(A + 16, 0);
       for (Index i=0; i<MR; ++i) {
           for (Index j=0; j<(NR); ++j) {
               P[i * NR + j] += A[i]*B_[j];
           }
       }
       A += MR;
       B_ += NR;
   }

   T *P_ = (T *)P;
   for (Index j=0; j<NR*div; ++j) {
       for (Index i=0; i<MR; ++i) {
           C[i*incRowC+j*incColC] *= beta;
           C[i*incRowC+j*incColC] += alpha*P_[i * NR * div + j];
       }
   }
}

// Macro-kernel: C := beta*C + alpha * A*B on an mc x nc block of C, with
// A (mc x kc) and B (kc x nc) already packed into MR- resp. NR-wide panels.
// Tiles the block into MR x NR micro-tiles and dispatches each to ugemm();
// column-panels of C are processed in parallel under OpenMP.
template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc,
      T alpha,
      const T *A, const T *B,
      Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    const Index MR = BlockSize<T>::MR;
    const Index NR = BlockSize<T>::NR;
    const Index mp = (mc+MR-1) / MR;   // number of row tiles (ceil)
    const Index np = (nc+NR-1) / NR;   // number of column tiles (ceil)
    const Index mr_ = mc % MR;         // height of the ragged last row tile (0 = full)
    const Index nr_ = nc % NR;         // width of the ragged last column tile (0 = full)

    // Each j-iteration touches a disjoint column panel of C, so the
    // parallel loop needs no synchronization; C_ is per-iteration (private).
    #if defined(_OPENMP)
    #pragma omp parallel for
    #endif
    for (Index j=0; j<np; ++j) {
        const Index nr = (j!=np-1 || nr_==0) ? NR : nr_;
        T C_[BlockSize<T>::MR*BlockSize<T>::NR];   // scratch tile for ragged edges
        __builtin_prefetch(B + j * kc * NR, 0);
        __builtin_prefetch(B + j * kc * NR + 16, 0);
        for (Index i=0; i<mp; ++i) {
            const Index mr = (i!=mp-1 || mr_==0) ? MR : mr_;
            __builtin_prefetch(A + i * kc * MR, 0);
            if (mr==MR && nr==NR) {
                // Full tile: update C directly.
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      beta,
                      &C[i*MR*incRowC+j*NR*incColC],
                      incRowC, incColC);
            } else {
                // Ragged edge tile: compute a full MR x NR tile into the
                // zeroed scratch buffer, then scale and merge only the
                // valid mr x nr sub-block into C.
                std::fill_n(C_, MR*NR, T(0));
                ugemm(kc, alpha,
                      &A[i*kc*MR], &B[j*kc*NR],
                      T(0),
                      C_, Index(1), MR);
                gescal(mr, nr, beta,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
                geaxpy(mr, nr, T(1), C_, Index(1), MR,
                       &C[i*MR*incRowC+j*NR*incColC],
                       incRowC, incColC);
            }
        }
    }
}

MR=4, NR=8, KC=1024

Compile:
g++ -std=gnu++11 -Wall -W -mavx -Ofast -DNDEBUG -DM_MAX=1500 -g -o matprod5 matprod5.cc

Baseline:
# m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Diff nrm1
  100 100 100 0.00172933 1156.52 0.00031314 6386.92 0
  200 200 200 0.0051078 3132.46 0.00181692 8806.12 0
  300 300 300 0.0166265 3247.83 0.00569728 9478.2 0
  400 400 400 0.0409126 3128.62 0.0134713 9501.71 0
  500 500 500 0.0908395 2752.11 0.0242485 10309.9 0
  600 600 600 0.187145 2308.37 0.0376499 11474.1 0
  700 700 700 0.306176 2240.54 0.0588034 11666 0
  800 800 800 0.463066 2211.35 0.0878199 11660.2 0
  900 900 900 0.867082 1681.5 0.125935 11577.4 0
 1000 1000 1000 1.20465 1660.24 0.171337 11672.9 0
 1100 1100 1100 1.63398 1629.15 0.230289 11559.4 3.76457e-19
 1200 1200 1200 2.09053 1653.17 0.29553 11694.2 3.78417e-19
 1300 1300 1300 2.672 1644.46 0.376785 11661.8 3.19717e-19
 1400 1400 1400 3.32236 1651.84 0.461209 11899.2 2.57872e-19
 1500 1500 1500 4.09372 1648.87 0.567471 11894.9 2.05188e-19

with prefetch:
# m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Diff nrm1
  100 100 100 0.00197408 1013.13 0.000814521 2455.43 0
  200 200 200 0.00571277 2800.74 0.00206576 7745.32 0
  300 300 300 0.0170868 3160.33 0.00547799 9857.63 0
  400 400 400 0.0401293 3189.69 0.0117615 10882.9 0
  500 500 500 0.0888678 2813.17 0.0220389 11343.6 0
  600 600 600 0.188413 2292.84 0.0361003 11966.7 0
  700 700 700 0.31356 2187.78 0.0561651 12214 0
  800 800 800 0.457264 2239.41 0.0836534 12241 0
  900 900 900 0.869343 1677.13 0.117946 12361.6 0
 1000 1000 1000 1.25295 1596.23 0.160194 12484.9 0
 1100 1100 1100 1.62086 1642.34 0.220637 12065.1 3.75611e-19
 1200 1200 1200 2.07918 1662.2 0.279095 12382.9 3.7818e-19
 1300 1300 1300 2.66945 1646.03 0.352817 12454.1 3.19286e-19
 1400 1400 1400 3.30731 1659.35 0.429694 12771.9 2.58137e-19
 1500 1500 1500 4.09499 1648.36 0.527458 12797.2 2.05e-19

--------------------------------------------
On Tue, 2/2/16, Michael Lehn <michael.lehn_at_[hidden]> wrote:

 Subject: Re: [ublas] Matrix multiplication performance
 To: "ublas mailing list" <ublas_at_[hidden]>
 Cc: "palik imre" <imre_palik_at_[hidden]>
 Date: Tuesday, 2 February, 2016, 23:26
 
 I also
 started to write down some notes on the rational behind the
 micro-kernel:
 http://www.mathematik.uni-ulm.de/~lehn/test_ublas/session6/page01.html
 
 On 02 Feb 2016, at
 21:43, Michael Lehn <michael.lehn_at_[hidden]>
 wrote:
 (So the pages are re-done
 …)
 
 
 The bug
 is because NR is BlockSize<T>::NR divided by the
 vector length.  So this should work
 //---------------------------------------------------------------------------------
 template <typename Index, typename T>
 typename std::enable_if<std::is_floating_point<T>::value,
 void>::type
 ugemm(Index kc, T alpha,
       const T *A, const T *B,
       T beta,
       T *C, Index incRowC, Index incColC)
 {
     A = (const T*) __builtin_assume_aligned (A, 128);
     B = (const T*) __builtin_assume_aligned (B, 128);
     static const unsigned div = 32/sizeof(T);
     static const Index MR = BlockSize<T>::MR;
     static const Index NR = BlockSize<T>::NR/div;

     typedef T vx __attribute__((vector_size (32)));

     vx P[MR*NR] __attribute__ ((aligned (128))) = {};
     const vx *B_ = (vx *)B;
     for (Index l=0; l<kc; ++l) {
         for (Index i=0; i<MR; ++i) {
             for (Index j=0; j<(NR); ++j) {
                 P[i * NR + j] += A[i]*B_[j];
             }
         }
         A += MR;
         B_ += NR;
     }

     T *P_ = (T *)P;
     for (Index j=0; j<NR*div; ++j) {
         for (Index i=0; i<MR; ++i) {
             C[i*incRowC+j*incColC] *= beta;
             C[i*incRowC+j*incColC] += alpha*P_[i * NR * div + j];
         }
     }
 }
 //---------------------------------------------------------------------------------
 However, as Karl pointed out, it also
 should treat the case beta==0 as a special
 case.
 But I don’t
 get any speedup.  Relevant for the performance
 is mainly the loop
     for (Index l=0; l<kc; ++l) {
         for (Index i=0; i<MR; ++i) {
             for (Index j=0; j<(NR); ++j) {
                 P[i * NR + j] += A[i]*B_[j];
             }
         }
         A += MR;
         B_ += NR;
     }
 From my experience, only if this is
 optimized to the extreme can a little bit of
 extra performance be achieved by treating some special cases
 for the update
     T *P_ = (T *)P;
     for (Index j=0; j<NR*div; ++j) {
         for (Index i=0; i<MR; ++i) {
             C[i*incRowC+j*incColC] *= beta;
             C[i*incRowC+j*incColC] += alpha*P_[i * NR * div + j];
         }
     }
 The main reason my code looks weird
 is that I also want to use it for teaching.  The
 way it is coded you can easily map it forth
 and back with the assembly code.
 
 new
 kernel:
 
 template <typename Index, typename T>
 typename
 std::enable_if<std::is_floating_point<T>::value,
 void>::type
 ugemm(Index kc, T alpha,
       const T *A, const T *B,
       T beta,
       T *C, Index incRowC, Index incColC)
 {
     A = (const T*) __builtin_assume_aligned (A, 128);
     B = (const T*) __builtin_assume_aligned (B, 128);
     static const unsigned div = 32/sizeof(T);
     static const Index MR = BlockSize<T>::MR;
     static const Index NR = BlockSize<T>::NR/div;

     typedef T vx __attribute__((vector_size (32)));

     vx P[MR*NR] __attribute__ ((aligned (128))) = {};
     const vx *B_ = (vx *)B;
     for (Index l=0; l<kc; ++l) {
         for (Index i=0; i<MR; ++i) {
             for (Index j=0; j<(NR); ++j) {
                 P[i * NR + j] += A[i]*B_[j];
             }
         }
         A += MR;
         B_ += NR;
     }

     T *P_ = (T *)P;
     for (Index j=0; j<NR; ++j) {
         for (Index i=0; i<MR; ++i) {
             C[i*incRowC+j*incColC] *= beta;
             C[i*incRowC+j*incColC] += alpha*P_[i * NR + j];
         }
     }
 }
 
 _______________________________________________
 ublas mailing list
 ublas_at_[hidden]
 http://lists.boost.org/mailman/listinfo.cgi/ublas
 Sent to: michael.lehn_at_[hidden]