|
Ublas : |
Subject: Re: [ublas] Matrix multiplication performance
From: palik imre (imre_palik_at_[hidden])
Date: 2016-02-03 14:53:45
Hi Michael,
first of all, the link on your page to test_ublas.tgz is still pointing to session4.tgz
I also realised that we are starting to become memory bound.
I am getting approximately 10% improvement with the following setup on my old AMD box:
// Micro-kernel: C <- beta*C + alpha * A*B for one MR x NR tile.
//
// A is a packed MR x kc panel, B a packed kc x NR panel; both are promised
// 128-byte aligned (the packing routines must guarantee this).  The product
// is accumulated in P, an array of GCC 32-byte vector types (one AVX
// register wide) that the compiler maps onto SIMD registers, and only
// written back to C at the end.
//
// kc       : depth of the rank-k update
// alpha    : scale applied to the accumulated product
// beta     : scale applied to the existing contents of C (0 => overwrite)
// C        : tile of the result matrix, strides incRowC/incColC
template <typename Index, typename T>
typename std::enable_if<std::is_floating_point<T>::value,
                        void>::type
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    A = (const T*) __builtin_assume_aligned (A, 128);
    B = (const T*) __builtin_assume_aligned (B, 128);
    static const unsigned div = 32/sizeof(T);       // scalars per 32-byte vector
    static const Index MR = BlockSize<T>::MR;
    static const Index NR = BlockSize<T>::NR/div;   // NR counted in vectors here
    typedef T vx __attribute__((vector_size (32)));

    vx P[MR*NR] __attribute__ ((aligned (128))) = {};
    const vx *B_ = (const vx *)B;

    // NOTE(review): this predicate is loop-invariant (kc never changes inside
    // the loop), so we either prefetch on every iteration or never.
    // Presumably `!(l&3)` (every 4th iteration) was intended — confirm with
    // the author.  Hoisted out of the loop; behavior is unchanged.
    const bool prefetchA = !(kc&3);
    for (Index l=0; l<kc; ++l) {
        if (prefetchA)
            __builtin_prefetch(A + 16, 0);
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i * NR + j] += A[i]*B_[j];
            }
        }
        A += MR;
        B_ += NR;
    }

    // Write-back: C = beta*C + alpha*P.  beta==0 is a special case (as Karl
    // pointed out on the list): C may be uninitialized scratch memory, and
    // 0*NaN would otherwise pollute the result, so overwrite instead of
    // scale-and-add.
    T *P_ = (T *)P;
    if (beta==T(0)) {
        for (Index j=0; j<NR*div; ++j) {
            for (Index i=0; i<MR; ++i) {
                C[i*incRowC+j*incColC] = alpha*P_[i * NR * div + j];
            }
        }
    } else {
        for (Index j=0; j<NR*div; ++j) {
            for (Index i=0; i<MR; ++i) {
                C[i*incRowC+j*incColC] *= beta;
                C[i*incRowC+j*incColC] += alpha*P_[i * NR * div + j];
            }
        }
    }
}
// Macro-kernel: update the mc x nc block of C with the product of the
// packed mc x kc buffer A and the packed kc x nc buffer B.
// Full MR x NR tiles are handed to ugemm directly; boundary tiles are
// computed into a local scratch tile C_ and merged via gescal/geaxpy.
// Column panels are independent, so the outer loop is OpenMP-parallel.
template <typename Index, typename T, typename Beta, typename TC>
void
mgemm(Index mc, Index nc, Index kc,
      T alpha,
      const T *A, const T *B,
      Beta beta,
      TC *C, Index incRowC, Index incColC)
{
    const Index MR  = BlockSize<T>::MR;
    const Index NR  = BlockSize<T>::NR;
    const Index mp  = (mc+MR-1) / MR;   // number of row panels (last may be partial)
    const Index np  = (nc+NR-1) / NR;   // number of column panels (last may be partial)
    const Index mr_ = mc % MR;          // height of a partial row panel (0 = none)
    const Index nr_ = nc % NR;          // width of a partial column panel (0 = none)
#if defined(_OPENMP)
#pragma omp parallel for
#endif
    for (Index jp=0; jp<np; ++jp) {
        const Index nr = (jp==np-1 && nr_!=0) ? nr_ : NR;
        T C_[BlockSize<T>::MR*BlockSize<T>::NR];    // per-thread scratch tile
        __builtin_prefetch(B + jp * kc * NR, 0);
        __builtin_prefetch(B + jp * kc * NR + 16, 0);
        for (Index ip=0; ip<mp; ++ip) {
            const Index mr = (ip==mp-1 && mr_!=0) ? mr_ : MR;
            __builtin_prefetch(A + ip * kc * MR, 0);
            if (mr<MR || nr<NR) {
                // Boundary tile: accumulate into C_ (beta=0 overwrites the
                // scratch), then scale and merge only the valid mr x nr part.
                std::fill_n(C_, MR*NR, T(0));
                ugemm(kc, alpha,
                      &A[ip*kc*MR], &B[jp*kc*NR],
                      T(0),
                      C_, Index(1), MR);
                gescal(mr, nr, beta,
                       &C[ip*MR*incRowC+jp*NR*incColC],
                       incRowC, incColC);
                geaxpy(mr, nr, T(1), C_, Index(1), MR,
                       &C[ip*MR*incRowC+jp*NR*incColC],
                       incRowC, incColC);
            } else {
                // Interior tile: let the micro-kernel update C in place.
                ugemm(kc, alpha,
                      &A[ip*kc*MR], &B[jp*kc*NR],
                      beta,
                      &C[ip*MR*incRowC+jp*NR*incColC],
                      incRowC, incColC);
            }
        }
    }
}
MR=4, NR=8, KC=1024
Compile:
g++ -std=gnu++11 -Wall -W -mavx -Ofast -DNDEBUG -DM_MAX=1500 -g -o matprod5 matprod5.cc
Baseline:
# m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Diff nrm1
100 100 100 0.00172933 1156.52 0.00031314 6386.92 0
200 200 200 0.0051078 3132.46 0.00181692 8806.12 0
300 300 300 0.0166265 3247.83 0.00569728 9478.2 0
400 400 400 0.0409126 3128.62 0.0134713 9501.71 0
500 500 500 0.0908395 2752.11 0.0242485 10309.9 0
600 600 600 0.187145 2308.37 0.0376499 11474.1 0
700 700 700 0.306176 2240.54 0.0588034 11666 0
800 800 800 0.463066 2211.35 0.0878199 11660.2 0
900 900 900 0.867082 1681.5 0.125935 11577.4 0
1000 1000 1000 1.20465 1660.24 0.171337 11672.9 0
1100 1100 1100 1.63398 1629.15 0.230289 11559.4 3.76457e-19
1200 1200 1200 2.09053 1653.17 0.29553 11694.2 3.78417e-19
1300 1300 1300 2.672 1644.46 0.376785 11661.8 3.19717e-19
1400 1400 1400 3.32236 1651.84 0.461209 11899.2 2.57872e-19
1500 1500 1500 4.09372 1648.87 0.567471 11894.9 2.05188e-19
with prefetch:
# m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Diff nrm1
100 100 100 0.00197408 1013.13 0.000814521 2455.43 0
200 200 200 0.00571277 2800.74 0.00206576 7745.32 0
300 300 300 0.0170868 3160.33 0.00547799 9857.63 0
400 400 400 0.0401293 3189.69 0.0117615 10882.9 0
500 500 500 0.0888678 2813.17 0.0220389 11343.6 0
600 600 600 0.188413 2292.84 0.0361003 11966.7 0
700 700 700 0.31356 2187.78 0.0561651 12214 0
800 800 800 0.457264 2239.41 0.0836534 12241 0
900 900 900 0.869343 1677.13 0.117946 12361.6 0
1000 1000 1000 1.25295 1596.23 0.160194 12484.9 0
1100 1100 1100 1.62086 1642.34 0.220637 12065.1 3.75611e-19
1200 1200 1200 2.07918 1662.2 0.279095 12382.9 3.7818e-19
1300 1300 1300 2.66945 1646.03 0.352817 12454.1 3.19286e-19
1400 1400 1400 3.30731 1659.35 0.429694 12771.9 2.58137e-19
1500 1500 1500 4.09499 1648.36 0.527458 12797.2 2.05e-19
--------------------------------------------
On Tue, 2/2/16, Michael Lehn <michael.lehn_at_[hidden]> wrote:
Subject: Re: [ublas] Matrix multiplication performance
To: "ublas mailing list" <ublas_at_[hidden]>
Cc: "palik imre" <imre_palik_at_[hidden]>
Date: Tuesday, 2 February, 2016, 23:26
I also
started to write down some notes on the rational behind the
micro-kernel:
http://www.mathematik.uni-ulm.de/~lehn/test_ublas/session6/page01.html
On 02 Feb 2016, at
21:43, Michael Lehn <michael.lehn_at_[hidden]>
wrote:
(So the pages are re-done …)
The bug is because NR is BlockSize<T>::NR divided by the vector length. So this should work:
//---------------------------------------------------------------------------------
template <typename Index, typename T>
typename std::enable_if<std::is_floating_point<T>::value,
void>::type
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    A = (const T*) __builtin_assume_aligned (A, 128);
    B = (const T*) __builtin_assume_aligned (B, 128);
    static const unsigned div = 32/sizeof(T);
    static const Index MR = BlockSize<T>::MR;
    static const Index NR = BlockSize<T>::NR/div;

    typedef T vx __attribute__((vector_size (32)));

    vx P[MR*NR] __attribute__ ((aligned (128))) = {};
    const vx *B_ = (vx *)B;
    for (Index l=0; l<kc; ++l) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<(NR); ++j) {
                P[i * NR + j] += A[i]*B_[j];
            }
        }
        A += MR;
        B_ += NR;
    }

    T *P_ = (T *)P;
    for (Index j=0; j<NR*div; ++j) {
        for (Index i=0; i<MR; ++i) {
            C[i*incRowC+j*incColC] *= beta;
            C[i*incRowC+j*incColC] += alpha*P_[i * NR * div + j];
        }
    }
}
//---------------------------------------------------------------------------------
However, as Karl pointed out, it also should treat the case beta==0 as a special case.
But I don't get any speedup. What is mainly relevant for the performance is the loop
    for (Index l=0; l<kc; ++l) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<(NR); ++j) {
                P[i * NR + j] += A[i]*B_[j];
            }
        }
        A += MR;
        B_ += NR;
    }
From my experience, only if this is optimized to the extreme can a little bit of extra performance be achieved by treating some special cases for the update
    T *P_ = (T *)P;
    for (Index j=0; j<NR*div; ++j) {
        for (Index i=0; i<MR; ++i) {
            C[i*incRowC+j*incColC] *= beta;
            C[i*incRowC+j*incColC] += alpha*P_[i * NR * div + j];
        }
    }
The main reason my code looks weird is that I also want to use it for teaching. The way it is coded, you can easily map it back and forth with the assembly code.
new kernel:

template <typename Index, typename T>
typename std::enable_if<std::is_floating_point<T>::value,
void>::type
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    A = (const T*) __builtin_assume_aligned (A, 128);
    B = (const T*) __builtin_assume_aligned (B, 128);
    static const unsigned div = 32/sizeof(T);
    static const Index MR = BlockSize<T>::MR;
    static const Index NR = BlockSize<T>::NR/div;

    typedef T vx __attribute__((vector_size (32)));

    vx P[MR*NR] __attribute__ ((aligned (128))) = {};
    const vx *B_ = (vx *)B;
    for (Index l=0; l<kc; ++l) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<(NR); ++j) {
                P[i * NR + j] += A[i]*B_[j];
            }
        }
        A += MR;
        B_ += NR;
    }

    T *P_ = (T *)P;
    for (Index j=0; j<NR; ++j) {
        for (Index i=0; i<MR; ++i) {
            C[i*incRowC+j*incColC] *= beta;
            C[i*incRowC+j*incColC] += alpha*P_[i * NR + j];
        }
    }
}
_______________________________________________
ublas mailing list
ublas_at_[hidden]
http://lists.boost.org/mailman/listinfo.cgi/ublas
Sent to: michael.lehn_at_[hidden]