Thanks for pointing this out.  You are right.

When stripping too many special cases out of the micro-kernels regarding matrix C (like C being
row/col major, aligned/not aligned) and alpha=1, I also removed the special case for beta=0.  At the
moment I am regenerating the websites … and the tar files.
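
For reference, the branch that has to go back in looks roughly like this (only a sketch; update_C is a placeholder name, the real micro-kernels inline this loop):

    // Sketch of the C update with an explicit beta==0 branch: when beta is
    // zero, C is overwritten without ever being read, so stale values (e.g.
    // NaNs from a freshly malloc'ed buffer) cannot propagate.
    template <typename Index>
    void
    update_C(Index MR, Index NR, double beta, const double *P,
             double *C, Index incRowC, Index incColC)
    {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                double cij = (beta==double(0))
                           ? double(0)                      // never read old C
                           : beta*C[i*incRowC+j*incColC];
                C[i*incRowC+j*incColC] = cij + P[i*NR+j];
            }
        }
    }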

On 02 Feb 2016, at 16:32, Karl Rupp <rupp@iue.tuwien.ac.at> wrote:

Hi guys,

I'm sorry to interrupt your performance-tuning fun, but please note that the kernel does not work correctly. To verify, consider the case where beta equals zero and C initially consists of NaNs (which can happen in practice if you start with a freshly malloc'ed buffer)...
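
To see it concretely (a minimal standalone snippet; zero is not an absorbing element for NaN in IEEE 754 arithmetic):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        double c    = std::nan("");  // stands in for an uninitialized entry of C
        double beta = 0.0;
        c *= beta;                   // 0*NaN == NaN: scaling does not clear c
        c += 1.0;                    // the accumulation then keeps the NaN
        std::printf("%f\n", c);      // prints "nan" instead of "1.0"
    }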

Best regards,
Karli

On 01/30/2016 05:48 PM, Michael Lehn wrote:
Here is the same in a slightly more general form:

template <typename Index>
typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
                     && BlockSize<double>::MR==4
                     && BlockSize<double>::NR==12
                     && BlockSize<double>::align==32,
         void>::type
ugemm(Index kc, double alpha,
      const double *A, const double *B,
      double beta,
      double *C, Index incRowC, Index incColC)
{
    static const Index MR = BlockSize<double>::MR;
    static const Index NR = BlockSize<double>::NR/4;   // NR counted in 4-wide vectors here
    typedef double vx __attribute__((vector_size (4*sizeof(double))));

    A = (const double*) __builtin_assume_aligned (A, 32);
    B = (const double*) __builtin_assume_aligned (B, 32);

    vx P[MR*NR] = {};

    for (Index l=0; l<kc; ++l) {
        const vx *b = (const vx *)B;

        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR+j] += A[i]*b[j];
            }
        }
        A += MR;
        B += NR*4;
    }

    if (alpha!=double(1)) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR+j] *= alpha;
            }
        }
    }

    for (Index i=0; i<MR; ++i) {
        for (Index j=0; j<NR; ++j) {
            const double *p = (const double *) &P[i*NR+j];
            for (Index j1=0; j1<4; ++j1) {
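                // CAUTION: for beta==0 this computes 0*C; if C holds NaNs
                // (e.g. a freshly malloc'ed buffer) they survive, because
                // 0*NaN == NaN (the issue reported at the top of this thread).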
                C[i*incRowC+(j*4+j1)*incColC] *= beta;
                C[i*incRowC+(j*4+j1)*incColC] += p[j1];
            }
        }
    }
}
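
(For readers without the surrounding sources: the kernels in this thread only assume a traits class along these lines. The stub below is reconstructed from how BlockSize is used above, not the actual definition.)

    // Reconstructed stub of the BlockSize traits, with the values the
    // enable_if conditions above require; the real definition lives
    // elsewhere in the sources.
    #include <cstdint>

    template <typename T>
    struct BlockSize
    {
        static const std::int64_t MR    = 4;
        static const std::int64_t NR    = 12;
        static const std::int64_t align = 32;
    };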

On 30 Jan 2016, at 13:49, Michael Lehn <michael.lehn@uni-ulm.de> wrote:

OK, and this version for MR=4, NR=12 even beats the asm kernel on Haswell:

//-- Micro Kernel --------------------------------------------------------------
template <typename Index>
typename std::enable_if<std::is_convertible<Index, std::int64_t>::value
                    && BlockSize<double>::MR==4
                    && BlockSize<double>::NR==12
                    && BlockSize<double>::align==32,
        void>::type
ugemm(Index kc, double alpha,
     const double *A, const double *B,
     double beta,
     double *C, Index incRowC, Index incColC)
{
   static const Index MR = BlockSize<double>::MR;
   static const Index NR = BlockSize<double>::NR;
   typedef double vx __attribute__((vector_size (4*sizeof(double))));

   A = (const double*) __builtin_assume_aligned (A, 32);
   B = (const double*) __builtin_assume_aligned (B, 32);

   vx P0_03 = {}; vx P0_47 = {}; vx P0_811 = {};
   vx P1_03 = {}; vx P1_47 = {}; vx P1_811 = {};
   vx P2_03 = {}; vx P2_47 = {}; vx P2_811 = {};
   vx P3_03 = {}; vx P3_47 = {}; vx P3_811 = {};

   for (Index l=0; l<kc; ++l) {
       const vx *b = (const vx *)B;

       P0_03 += A[0]*b[0]; P0_47 += A[0]*b[1]; P0_811 += A[0]*b[2];
       P1_03 += A[1]*b[0]; P1_47 += A[1]*b[1]; P1_811 += A[1]*b[2];
       P2_03 += A[2]*b[0]; P2_47 += A[2]*b[1]; P2_811 += A[2]*b[2];
       P3_03 += A[3]*b[0]; P3_47 += A[3]*b[1]; P3_811 += A[3]*b[2];
       A += MR;
       B += NR;
   }

   P0_03 *= alpha;  P0_47 *= alpha; P0_811 *= alpha;
   P1_03 *= alpha;  P1_47 *= alpha; P1_811 *= alpha;
   P2_03 *= alpha;  P2_47 *= alpha; P2_811 *= alpha;
   P3_03 *= alpha;  P3_47 *= alpha; P3_811 *= alpha;

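    // CAUTION: same beta==0 issue as in the kernel above: 0*NaN == NaN,
    // so an uninitialized C is not cleared by this in-place scaling.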
   if (beta!=double(1)) {
       for (Index i=0; i<MR; ++i) {
           for (Index j=0; j<NR; ++j) {
               C[i*incRowC+j*incColC] *= beta;
           }
       }
   }

   const double *p = (const double *) &P0_03;
   C[0*incRowC+0*incColC] += p[0];
   C[0*incRowC+1*incColC] += p[1];
   C[0*incRowC+2*incColC] += p[2];
   C[0*incRowC+3*incColC] += p[3];

   p = (const double *) &P0_47;
   C[0*incRowC+4*incColC] += p[0];
   C[0*incRowC+5*incColC] += p[1];
   C[0*incRowC+6*incColC] += p[2];
   C[0*incRowC+7*incColC] += p[3];

   p = (const double *) &P0_811;
   C[0*incRowC+8*incColC] += p[0];
   C[0*incRowC+9*incColC] += p[1];
   C[0*incRowC+10*incColC] += p[2];
   C[0*incRowC+11*incColC] += p[3];

   p = (const double *) &P1_03;
   C[1*incRowC+0*incColC] += p[0];
   C[1*incRowC+1*incColC] += p[1];
   C[1*incRowC+2*incColC] += p[2];
   C[1*incRowC+3*incColC] += p[3];

   p = (const double *) &P1_47;
   C[1*incRowC+4*incColC] += p[0];
   C[1*incRowC+5*incColC] += p[1];
   C[1*incRowC+6*incColC] += p[2];
   C[1*incRowC+7*incColC] += p[3];

   p = (const double *) &P1_811;
   C[1*incRowC+8*incColC] += p[0];
   C[1*incRowC+9*incColC] += p[1];
   C[1*incRowC+10*incColC] += p[2];
   C[1*incRowC+11*incColC] += p[3];

   p = (const double *) &P2_03;
   C[2*incRowC+0*incColC] += p[0];
   C[2*incRowC+1*incColC] += p[1];
   C[2*incRowC+2*incColC] += p[2];
   C[2*incRowC+3*incColC] += p[3];

   p = (const double *) &P2_47;
   C[2*incRowC+4*incColC] += p[0];
   C[2*incRowC+5*incColC] += p[1];
   C[2*incRowC+6*incColC] += p[2];
   C[2*incRowC+7*incColC] += p[3];

   p = (const double *) &P2_811;
   C[2*incRowC+8*incColC] += p[0];
   C[2*incRowC+9*incColC] += p[1];
   C[2*incRowC+10*incColC] += p[2];
   C[2*incRowC+11*incColC] += p[3];

   p = (const double *) &P3_03;
   C[3*incRowC+0*incColC] += p[0];
   C[3*incRowC+1*incColC] += p[1];
   C[3*incRowC+2*incColC] += p[2];
   C[3*incRowC+3*incColC] += p[3];

   p = (const double *) &P3_47;
   C[3*incRowC+4*incColC] += p[0];
   C[3*incRowC+5*incColC] += p[1];
   C[3*incRowC+6*incColC] += p[2];
   C[3*incRowC+7*incColC] += p[3];

   p = (const double *) &P3_811;
   C[3*incRowC+8*incColC] += p[0];
   C[3*incRowC+9*incColC] += p[1];
   C[3*incRowC+10*incColC] += p[2];
   C[3*incRowC+11*incColC] += p[3];

}
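
If you want to sanity-check the kernel, a hypothetical smoke test could look like this (it assumes the ugemm above and a BlockSize definition like the stub earlier in the thread are in scope; the buffers use the packed format the kernel expects):

    // Hypothetical smoke test: A is packed as kc slabs of MR=4 contiguous
    // values, B as kc slabs of NR=12 contiguous values, C is a row-major
    // 4x12 matrix. With all inputs 1.0 every entry of C must equal kc.
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const std::int64_t kc = 2;
        alignas(32) double A[4*2];
        alignas(32) double B[2*12];
        double C[4*12] = {};            // zero-initialized, so beta==0 is safe here

        for (int i=0; i<4*2;  ++i) A[i] = 1.0;
        for (int i=0; i<2*12; ++i) B[i] = 1.0;

        ugemm(kc, 1.0, A, B, 0.0, C, std::int64_t(12), std::int64_t(1));

        std::printf("C[0][0] = %f (expected %f)\n", C[0], double(kc));
        return 0;
    }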
