Boost logo

Ublas :

Subject: Re: [ublas] Matrix multiplication performance
From: Michael Lehn (michael.lehn_at_[hidden])
Date: 2016-01-30 07:49:43


Ok, and this version for MR=4, NR=12 even beats the asm kernel on Haswell:

//-- Micro Kernel --------------------------------------------------------------
template <typename Index>
typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
                     && BlockSize<double>::MR==4
                     && BlockSize<double>::NR==12
                     && BlockSize<double>::align==32,
         void>::type
ugemm(Index kc, double alpha,
      const double *A, const double *B,
      double beta,
      double *C, Index incRowC, Index incColC)
{
    static const Index MR = BlockSize<double>::MR;
    static const Index NR = BlockSize<double>::NR;
    typedef double vx __attribute__((vector_size (4*sizeof(double))));

    A = (const double*) __builtin_assume_aligned (A, 32);
    B = (const double*) __builtin_assume_aligned (B, 32);

    vx P0_03 = {}; vx P0_47 = {}; vx P0_811 = {};
    vx P1_03 = {}; vx P1_47 = {}; vx P1_811 = {};
    vx P2_03 = {}; vx P2_47 = {}; vx P2_811 = {};
    vx P3_03 = {}; vx P3_47 = {}; vx P3_811 = {};

    for (Index l=0; l<kc; ++l) {
        const vx *b = (const vx *)B;

        P0_03 += A[0]*b[0]; P0_47 += A[0]*b[1]; P0_811 += A[0]*b[2];
        P1_03 += A[1]*b[0]; P1_47 += A[1]*b[1]; P1_811 += A[1]*b[2];
        P2_03 += A[2]*b[0]; P2_47 += A[2]*b[1]; P2_811 += A[2]*b[2];
        P3_03 += A[3]*b[0]; P3_47 += A[3]*b[1]; P3_811 += A[3]*b[2];
        A += MR;
        B += NR;
    }

    P0_03 *= alpha; P0_47 *= alpha; P0_811 *= alpha;
    P1_03 *= alpha; P1_47 *= alpha; P1_811 *= alpha;
    P2_03 *= alpha; P2_47 *= alpha; P2_811 *= alpha;
    P3_03 *= alpha; P3_47 *= alpha; P3_811 *= alpha;

    if (beta!=double(1)) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                C[i*incRowC+j*incColC] *= beta;
            }
        }
    }

    const double *p = (const double *) &P0_03;
    C[0*incRowC+0*incColC] += p[0];
    C[0*incRowC+1*incColC] += p[1];
    C[0*incRowC+2*incColC] += p[2];
    C[0*incRowC+3*incColC] += p[3];

    p = (const double *) &P0_47;
    C[0*incRowC+4*incColC] += p[0];
    C[0*incRowC+5*incColC] += p[1];
    C[0*incRowC+6*incColC] += p[2];
    C[0*incRowC+7*incColC] += p[3];

    p = (const double *) &P0_811;
    C[0*incRowC+8*incColC] += p[0];
    C[0*incRowC+9*incColC] += p[1];
    C[0*incRowC+10*incColC] += p[2];
    C[0*incRowC+11*incColC] += p[3];

    p = (const double *) &P1_03;
    C[1*incRowC+0*incColC] += p[0];
    C[1*incRowC+1*incColC] += p[1];
    C[1*incRowC+2*incColC] += p[2];
    C[1*incRowC+3*incColC] += p[3];

    p = (const double *) &P1_47;
    C[1*incRowC+4*incColC] += p[0];
    C[1*incRowC+5*incColC] += p[1];
    C[1*incRowC+6*incColC] += p[2];
    C[1*incRowC+7*incColC] += p[3];

    p = (const double *) &P1_811;
    C[1*incRowC+8*incColC] += p[0];
    C[1*incRowC+9*incColC] += p[1];
    C[1*incRowC+10*incColC] += p[2];
    C[1*incRowC+11*incColC] += p[3];

    p = (const double *) &P2_03;
    C[2*incRowC+0*incColC] += p[0];
    C[2*incRowC+1*incColC] += p[1];
    C[2*incRowC+2*incColC] += p[2];
    C[2*incRowC+3*incColC] += p[3];

    p = (const double *) &P2_47;
    C[2*incRowC+4*incColC] += p[0];
    C[2*incRowC+5*incColC] += p[1];
    C[2*incRowC+6*incColC] += p[2];
    C[2*incRowC+7*incColC] += p[3];

    p = (const double *) &P2_811;
    C[2*incRowC+8*incColC] += p[0];
    C[2*incRowC+9*incColC] += p[1];
    C[2*incRowC+10*incColC] += p[2];
    C[2*incRowC+11*incColC] += p[3];

    p = (const double *) &P3_03;
    C[3*incRowC+0*incColC] += p[0];
    C[3*incRowC+1*incColC] += p[1];
    C[3*incRowC+2*incColC] += p[2];
    C[3*incRowC+3*incColC] += p[3];

    p = (const double *) &P3_47;
    C[3*incRowC+4*incColC] += p[0];
    C[3*incRowC+5*incColC] += p[1];
    C[3*incRowC+6*incColC] += p[2];
    C[3*incRowC+7*incColC] += p[3];

    p = (const double *) &P3_811;
    C[3*incRowC+8*incColC] += p[0];
    C[3*incRowC+9*incColC] += p[1];
    C[3*incRowC+10*incColC] += p[2];
    C[3*incRowC+11*incColC] += p[3];

}

[lehn_at_node042 session5]$ g++ -mfma -Wall -Ofast -I ../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC -DNDEBUG -DBS_D_NR=12 -DBS_D_NC=4092 matprod.cc
[lehn_at_node042 session5]$ ./a.out
# m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Res
  100 100 100 0.0010737 1862.72 0.00035514 5631.58 0
  200 200 200 0.00450262 3553.49 0.0015222 10511.1 0
  300 300 300 0.0149401 3614.43 0.00197623 27324.7 1.91339e-16
  400 400 400 0.0306213 4180.1 0.00441727 28977.2 8.30501e-17
  500 500 500 0.0521503 4793.83 0.00755924 33072.1 3.47577e-17
  600 600 600 0.0829816 5205.97 0.0124722 34637.1 1.60972e-17
  700 700 700 0.129317 5304.79 0.0192483 35639.4 8.37943e-18
  800 800 800 0.192606 5316.55 0.0296521 34533.8 4.66971e-18
  900 900 900 0.274752 5306.61 0.0404913 36007.7 2.77631e-18
 1000 1000 1000 0.379118 5275.41 0.054979 36377.5 1.74802e-18
 1100 1100 1100 0.500672 5316.85 0.0742024 35874.8 1.141e-18
 1200 1200 1200 0.646887 5342.51 0.0927714 37252.9 7.7394e-19
 1300 1300 1300 0.823663 5334.7 0.119398 36801.2 5.41738e-19
 1400 1400 1400 1.02766 5340.3 0.147005 37332.1 3.88125e-19
 1500 1500 1500 1.26277 5345.39 0.176592 38223.6 2.85178e-19
 1600 1600 1600 1.59651 5131.18 0.22395 36579.7 2.12873e-19
 1700 1700 1700 2.66806 3682.82 0.259462 37870.7 1.62096e-19
 1800 1800 1800 3.34888 3482.96 0.311059 37497.7 1.25272e-19
 1900 1900 1900 4.09763 3347.79 0.371178 36958 9.81479e-20

On 30 Jan 2016, at 13:33, Michael Lehn <michael.lehn_at_[hidden]> wrote:

> On my Haswell your code performs very well. For N=M=K it reaches single threaded 32GFLOPS
> compared to 37GFLOPS with my FMA kernel. So I would say this is close!
>
> I did the following two things:
>
> 1) Compiled with “-mfma”
> 2) Used MR=4 NR=8 and the other defaults
>
> Here the results:
>
> (A) Your code
>
> [lehn_at_node042 session5]$ g++ -mfma -Wall -Ofast -I ../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC -DNDEBUG matprod.cc
> [lehn_at_node042 session5]$ ./a.out
> # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Res
> 100 100 100 0.000993595 2012.89 0.000402098 4973.91 0
> 200 200 200 0.00508121 3148.86 0.0016022 9986.3 0
> 300 300 300 0.0148755 3630.13 0.00250342 21570.5 1.92273e-16
> 400 400 400 0.0304433 4204.53 0.00562129 22770.6 8.3039e-17
> 500 500 500 0.0515749 4847.32 0.00889942 28091.7 3.47892e-17
> 600 600 600 0.0821662 5257.63 0.0144308 29936 1.6149e-17
> 700 700 700 0.128187 5351.55 0.0224022 30622 8.38824e-18
> 800 800 800 0.190724 5369.01 0.0340685 30057.1 4.69155e-18
> 900 900 900 0.272908 5342.46 0.0474595 30720.9 2.78366e-18
> 1000 1000 1000 0.375997 5319.19 0.0635467 31472.9 1.75117e-18
> 1100 1100 1100 0.504189 5279.76 0.0856357 31085.2 1.14159e-18
> 1200 1200 1200 0.646057 5349.37 0.110654 31232.5 7.74152e-19
> 1300 1300 1300 0.828805 5301.61 0.137722 31904.8 5.43016e-19
> 1400 1400 1400 1.0273 5342.18 0.171288 32039.6 3.87778e-19
> 1500 1500 1500 1.26946 5317.24 0.208467 32379.3 2.85337e-19
> 1600 1600 1600 1.58969 5153.21 0.264495 30972.3 2.13061e-19
> 1700 1700 1700 2.67863 3668.3 0.30419 32302.2 1.62006e-19
> ^C
>
> (B) My FMA Kernel
>
> [lehn_at_node042 session5]$ g++ -mfma -Wall -Ofast -I ../boost_1_60_0/ -std=c++11 -DHAVE_FMA -DNDEBUG matprod.cc
> [lehn_at_node042 session5]$ ./a.out
> # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Res
> 100 100 100 0.0010306 1940.61 0.000378977 5277.37 7.66934
> 200 200 200 0.0049895 3206.74 0.00101729 15728.1 0.236755
> 300 300 300 0.0150318 3592.39 0.00294536 18333.9 0
> 400 400 400 0.0306356 4178.14 0.00495991 25806.9 0.00179347
> 500 500 500 0.052443 4767.08 0.00784496 31867.6 0.000927111
> 600 600 600 0.0824758 5237.9 0.0127618 33850.9 8.49298e-18
> 700 700 700 0.128639 5332.73 0.0192815 35578.1 0.0107258
> 800 800 800 0.192102 5330.51 0.0294848 34729.7 0.00548943
> 900 900 900 0.272509 5350.28 0.0407217 35804 2.34506e-18
> 1000 1000 1000 0.372414 5370.37 0.0557306 35886.9 0.00180032
> 1100 1100 1100 0.495172 5375.91 0.0759043 35070.5 0.00111911
> 1200 1200 1200 0.643628 5369.56 0.0936622 36898.5 7.13231e-19
> 1300 1300 1300 0.831523 5284.28 0.119804 36676.5 0.000485489
> 1400 1400 1400 1.07165 5121.07 0.148407 36979.5 0.000334856
> 1500 1500 1500 1.26523 5334.99 0.179347 37636.6 2.74426e-19
> 1600 1600 1600 1.57777 5192.13 0.224779 36444.7 0.00017181
> 1700 1700 1700 2.67335 3675.54 0.259066 37928.6 0.000126808
> 1800 1800 1800 3.35908 3472.38 0.305459 38185.1 1.22716e-19
> ^C
>
> (C) Your code with the parameters you suggested.
>
> [lehn_at_node042 session5]$ g++ -mfma -Wall -Ofast -I ../boost_1_60_0/ -std=c++11 -DHAVE_GCCVEC -DBS_D_MR=2 -DBS_D_NR=16 -DBS_D_KC=512 -DNDEBUG matprod.cc
> [lehn_at_node042 session5]$ ./a.out
> # m n k uBLAS: t1 MFLOPS Blocked: t2 MFLOPS Res
> 100 100 100 0.000991205 2017.75 0.000371863 5378.32 0
> 200 200 200 0.00492544 3248.44 0.000893081 17915.5 0
> 300 300 300 0.0149147 3620.6 0.00288725 18702.9 0
> 400 400 400 0.0304561 4202.77 0.00533248 24003.9 0
> 500 500 500 0.0519717 4810.31 0.0094423 26476.6 0
> 600 600 600 0.0815292 5298.72 0.0160959 26839.2 8.50004e-18
> 700 700 700 0.127243 5391.25 0.0241644 28388.9 5.82176e-18
> 800 800 800 0.190188 5384.16 0.0379423 26988.3 3.67296e-18
> 900 900 900 0.270269 5394.62 0.0526334 27701.1 2.34234e-18
> 1000 1000 1000 0.369451 5413.44 0.0710608 28144.9 1.53537e-18
> 1100 1100 1100 0.491269 5418.62 0.0953915 27906 1.03025e-18
> 1200 1200 1200 0.638369 5413.8 0.119802 28847.5 7.13779e-19
> 1300 1300 1300 0.822215 5344.1 0.15489 28368.5 5.07376e-19
> 1400 1400 1400 1.03643 5295.12 0.191441 28666.8 3.69364e-19
> 1500 1500 1500 1.26753 5325.32 0.231101 29208 2.74895e-19
> 1600 1600 1600 1.60965 5089.31 0.289249 28321.6 2.0631e-19
> 1700 1700 1700 2.67171 3677.79 0.333099 29498.7 1.57713e-19
> 1800 1800 1800 3.34254 3489.56 0.397267 29360.6 1.2262e-19
> ^C
> [lehn_at_node042 session5]$
>
>
>
> Cheers,
>
> Michael
>
>
> On 29 Jan 2016, at 23:31, palik imre <imre_palik_at_[hidden]> wrote:
>
>> Here is a more sane kernel using gcc simd vectors:
>>
>> template <typename Index, typename T>
>> typename std::enable_if<std::is_floating_point<T>::value,
>> void>::type
>> ugemm(Index kc, T alpha,
>> const T *A, const T *B,
>> T beta,
>> T *C, Index incRowC, Index incColC)
>> {
>> static const Index MR = BlockSize<T>::MR;
>> static const Index NR = BlockSize<T>::NR;
>> static const unsigned div = 32/sizeof(T);
>> typedef T vx __attribute__((vector_size (32)));
>>
>> vx P[MR*NR/div] __attribute__ ((aligned (128))) = {};
>> const vx *B_ = (vx *)B;
>> for (Index l=0; l<kc; ++l) {
>> for (Index j=0; j<(NR/div); ++j) {
>> for (Index i=0; i<MR; ++i) {
>> P[i * NR/div + j] += A[i + l * MR]*B_[l*(NR/div)+j];
>> }
>> }
>> }
>>
>> T *P_ = (T *)P;
>> for (Index i=0; i<MR; ++i) {
>> for (Index j=0; j<NR; ++j) {
>> C[i*incRowC+j*incColC] *= beta;
>> C[i*incRowC+j*incColC] += alpha*P_[i * NR + j];
>> }
>> }
>> }
>>
>>
>> this plugs nicely into Michael's code, and is slightly faster than my previous try (but still nowhere near Michael's hand-crafted kernel).
>>
>> The good blocksize for double is MR=2, NR=16, KC=512
>> multiply MR and NR by 2 for float.
>>
>>
>> Cheers,
>>
>> Imre
>>
>
> _______________________________________________
> ublas mailing list
> ublas_at_[hidden]
> http://lists.boost.org/mailman/listinfo.cgi/ublas
> Sent to: michael.lehn_at_[hidden]
>