|
Ublas : |
Subject: Re: [ublas] Matrix multiplication performance
From: Michael Lehn (michael.lehn_at_[hidden])
Date: 2016-01-30 08:03:22
Can you or someone confirm that on Haswell?
On 30 Jan 2016, at 13:49, Michael Lehn <michael.lehn_at_[hidden]> wrote:
> Ok, and this version for MR=4, NR=12 even beats the asm kernel on Haswell:
>
> //-- Micro Kernel --------------------------------------------------------------
> template <typename Index>
> typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
> && BlockSize<double>::MR==4
> && BlockSize<double>::NR==12
> && BlockSize<double>::align==32,
> void>::type
> ugemm(Index kc, double alpha,
> const double *A, const double *B,
> double beta,
> double *C, Index incRowC, Index incColC)
> {
> static const Index MR = BlockSize<double>::MR;
> static const Index NR = BlockSize<double>::NR;
> typedef double vx __attribute__((vector_size (4*sizeof(double))));
>
> A = (const double*) __builtin_assume_aligned (A, 32);
> B = (const double*) __builtin_assume_aligned (B, 32);
>
> vx P0_03 = {}; vx P0_47 = {}; vx P0_811 = {};
> vx P1_03 = {}; vx P1_47 = {}; vx P1_811 = {};
> vx P2_03 = {}; vx P2_47 = {}; vx P2_811 = {};
> vx P3_03 = {}; vx P3_47 = {}; vx P3_811 = {};
>
> for (Index l=0; l<kc; ++l) {
> const vx *b = (const vx *)B;
>
> P0_03 += A[0]*b[0]; P0_47 += A[0]*b[1]; P0_811 += A[0]*b[2];
> P1_03 += A[1]*b[0]; P1_47 += A[1]*b[1]; P1_811 += A[1]*b[2];
> P2_03 += A[2]*b[0]; P2_47 += A[2]*b[1]; P2_811 += A[2]*b[2];
> P3_03 += A[3]*b[0]; P3_47 += A[3]*b[1]; P3_811 += A[3]*b[2];
> A += MR;
> B += NR;
> }
>
> P0_03 *= alpha; P0_47 *= alpha; P0_811 *= alpha;
> P1_03 *= alpha; P1_47 *= alpha; P1_811 *= alpha;
> P2_03 *= alpha; P2_47 *= alpha; P2_811 *= alpha;
> P3_03 *= alpha; P3_47 *= alpha; P3_811 *= alpha;
>
> if (beta!=double(1)) {
> for (Index i=0; i<MR; ++i) {
> for (Index j=0; j<NR; ++j) {
> C[i*incRowC+j*incColC] *= beta;
> }
> }
> }
>
> const double *p = (const double *) &P0_03;
> C[0*incRowC+0*incColC] += p[0];
> C[0*incRowC+1*incColC] += p[1];
> C[0*incRowC+2*incColC] += p[2];
> C[0*incRowC+3*incColC] += p[3];
>
> p = (const double *) &P0_47;
> C[0*incRowC+4*incColC] += p[0];
> C[0*incRowC+5*incColC] += p[1];
> C[0*incRowC+6*incColC] += p[2];
> C[0*incRowC+7*incColC] += p[3];
>
> p = (const double *) &P0_811;
> C[0*incRowC+8*incColC] += p[0];
> C[0*incRowC+9*incColC] += p[1];
> C[0*incRowC+10*incColC] += p[2];
> C[0*incRowC+11*incColC] += p[3];
>
> p = (const double *) &P1_03;
> C[1*incRowC+0*incColC] += p[0];
> C[1*incRowC+1*incColC] += p[1];
> C[1*incRowC+2*incColC] += p[2];
> C[1*incRowC+3*incColC] += p[3];
>
> p = (const double *) &P1_47;
> C[1*incRowC+4*incColC] += p[0];
> C[1*incRowC+5*incColC] += p[1];
> C[1*incRowC+6*incColC] += p[2];
> C[1*incRowC+7*incColC] += p[3];
>
> p = (const double *) &P1_811;
> C[1*incRowC+8*incColC] += p[0];
> C[1*incRowC+9*incColC] += p[1];
> C[1*incRowC+10*incColC] += p[2];
> C[1*incRowC+11*incColC] += p[3];
>
> p = (const double *) &P2_03;
> C[2*incRowC+0*incColC] += p[0];
> C[2*incRowC+1*incColC] += p[1];
> C[2*incRowC+2*incColC] += p[2];
> C[2*incRowC+3*incColC] += p[3];
>
> p = (const double *) &P2_47;
> C[2*incRowC+4*incColC] += p[0];
> C[2*incRowC+5*incColC] += p[1];
> C[2*incRowC+6*incColC] += p[2];
> C[2*incRowC+7*incColC] += p[3];
>
> p = (const double *) &P2_811;
> C[2*incRowC+8*incColC] += p[0];
> C[2*incRowC+9*incColC] += p[1];
> C[2*incRowC+10*incColC] += p[2];
> C[2*incRowC+11*incColC] += p[3];
>
> p = (const double *) &P3_03;
> C[3*incRowC+0*incColC] += p[0];
> C[3*incRowC+1*incColC] += p[1];
> C[3*incRowC+2*incColC] += p[2];
> C[3*incRowC+3*incColC] += p[3];
>
> p = (const double *) &P3_47;
> C[3*incRowC+4*incColC] += p[0];
> C[3*incRowC+5*incColC] += p[1];
> C[3*incRowC+6*incColC] += p[2];
> C[3*incRowC+7*incColC] += p[3];
>
> p = (const double *) &P3_811;
> C[3*incRowC+8*incColC] += p[0];
> C[3*incRowC+9*incColC] += p[1];
> C[3*incRowC+10*incColC] += p[2];
> C[3*incRowC+11*incColC] += p[3];
>
> }
>
> [lehn_at_node042 session5]$ g++ -mfma -Wall -Ofast -I ../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC -DNDEBUG -DBS_D_NR=12 -DBS_D_NC=4092 matprod.cc
> [lehn_at_node042 session5]$ ./a.out
> # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Res
> 100 100 100 0.0010737 1862.72 0.00035514 5631.58 0
> 200 200 200 0.00450262 3553.49 0.0015222 10511.1 0
> 300 300 300 0.0149401 3614.43 0.00197623 27324.7 1.91339e-16
> 400 400 400 0.0306213 4180.1 0.00441727 28977.2 8.30501e-17
> 500 500 500 0.0521503 4793.83 0.00755924 33072.1 3.47577e-17
> 600 600 600 0.0829816 5205.97 0.0124722 34637.1 1.60972e-17
> 700 700 700 0.129317 5304.79 0.0192483 35639.4 8.37943e-18
> 800 800 800 0.192606 5316.55 0.0296521 34533.8 4.66971e-18
> 900 900 900 0.274752 5306.61 0.0404913 36007.7 2.77631e-18
> 1000 1000 1000 0.379118 5275.41 0.054979 36377.5 1.74802e-18
> 1100 1100 1100 0.500672 5316.85 0.0742024 35874.8 1.141e-18
> 1200 1200 1200 0.646887 5342.51 0.0927714 37252.9 7.7394e-19
> 1300 1300 1300 0.823663 5334.7 0.119398 36801.2 5.41738e-19
> 1400 1400 1400 1.02766 5340.3 0.147005 37332.1 3.88125e-19
> 1500 1500 1500 1.26277 5345.39 0.176592 38223.6 2.85178e-19
> 1600 1600 1600 1.59651 5131.18 0.22395 36579.7 2.12873e-19
> 1700 1700 1700 2.66806 3682.82 0.259462 37870.7 1.62096e-19
> 1800 1800 1800 3.34888 3482.96 0.311059 37497.7 1.25272e-19
> 1900 1900 1900 4.09763 3347.79 0.371178 36958 9.81479e-20