(So the pages are re-done …)
The bug is that NR is BlockSize<T>::NR divided by the vector length, so the
following should work:
//---------------------------------------------------------------------------------
// GEMM micro-kernel: C <- beta*C + alpha*A*B for one MR x NR panel.
//
// A: packed MR x kc panel (column-major, MR scalars per k-step).
// B: packed kc x NR panel (row-major, NR scalars per k-step).
// C: MR x NR target panel addressed via incRowC/incColC strides.
//
// Implemented with GCC vector extensions: 256-bit vectors of `div` scalars,
// so the accumulator P holds MR x (NR/div) vectors = MR x NR scalars.
// NOTE(review): assumes BlockSize<T>::NR is a multiple of 32/sizeof(T) —
// confirm against the BlockSize definition.
template <typename Index, typename T>
typename std::enable_if<std::is_floating_point<T>::value,
         void>::type
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    // The packing routines guarantee 128-byte alignment; tell the compiler.
    A = (const T*) __builtin_assume_aligned (A, 128);
    B = (const T*) __builtin_assume_aligned (B, 128);
    static const unsigned div = 32/sizeof(T);       // scalars per 256-bit vector
    static const Index MR = BlockSize<T>::MR;
    static const Index NR = BlockSize<T>::NR/div;   // NR counted in *vectors*

    typedef T vx __attribute__((vector_size (32)));

    vx P[MR*NR] __attribute__ ((aligned (128))) = {};   // zero-initialized accumulator
    const vx *B_ = (const vx *)B;

    // Accumulate kc rank-1 updates: P(i,:) += A[i] * B(l,:).
    for (Index l=0; l<kc; ++l) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR + j] += A[i]*B_[j];
            }
        }
        A += MR;
        B_ += NR;
    }

    // Write back, viewing P as scalars: each row of P is NR*div scalars wide.
    // beta==0 is a special case: C must not be read (it may be uninitialized
    // or contain NaN/Inf, and 0*NaN != 0), so overwrite instead of scaling.
    T *P_ = (T *)P;
    if (beta==T(0)) {
        for (Index j=0; j<NR*Index(div); ++j) {
            for (Index i=0; i<MR; ++i) {
                C[i*incRowC+j*incColC] = alpha*P_[i*NR*div + j];
            }
        }
    } else {
        for (Index j=0; j<NR*Index(div); ++j) {
            for (Index i=0; i<MR; ++i) {
                C[i*incRowC+j*incColC] *= beta;
                C[i*incRowC+j*incColC] += alpha*P_[i*NR*div + j];
            }
        }
    }
}
//---------------------------------------------------------------------------------
However, as Karl pointed out, it should also treat the case beta==0
as a special case (C must not be read then, since 0*NaN is NaN).
But I don’t get any speedup. The part that matters for performance is
mainly this loop:
for (Index l=0; l<kc; ++l) {
for (Index i=0; i<MR; ++i) {
for (Index j=0; j<(NR); ++j) {
P[i * NR + j] += A[i]*B_[j];
}
}
A += MR;
B_ += NR;
}
In my experience, only once this loop is optimized to the extreme can a
little extra performance be gained by treating some special cases in
the update:
T *P_ = (T *)P;
for (Index j=0; j<NR*div; ++j) {
for (Index i=0; i<MR; ++i) {
C[i*incRowC+j*incColC] *= beta;
C[i*incRowC+j*incColC] += alpha*P_[i * NR * div + j];
}
}
The main reason my code looks weird is that I also want to use it
for teaching. The way it is coded, you can easily map it back and
forth to the assembly code.
new kernel:
// GEMM micro-kernel: C <- beta*C + alpha*A*B for one MR x NR panel,
// using GCC vector extensions (256-bit vectors of `div` scalars).
//
// BUG FIX in the write-back loop: P holds MR*NR *vectors*, i.e. each row of
// the scalar view P_ is NR*div scalars wide and C is MR x (NR*div). The
// original loop ran j<NR with P_[i*NR+j], so only the first NR columns of C
// were updated and the row stride into P_ was wrong. Restored the correct
// bounds/indexing (as in the earlier version) and added the beta==0 case.
template <typename Index, typename T>
typename std::enable_if<std::is_floating_point<T>::value,
         void>::type
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    // Packed panels are 128-byte aligned; tell the compiler.
    A = (const T*) __builtin_assume_aligned (A, 128);
    B = (const T*) __builtin_assume_aligned (B, 128);
    static const unsigned div = 32/sizeof(T);       // scalars per 256-bit vector
    static const Index MR = BlockSize<T>::MR;
    static const Index NR = BlockSize<T>::NR/div;   // NR counted in *vectors*

    typedef T vx __attribute__((vector_size (32)));

    vx P[MR*NR] __attribute__ ((aligned (128))) = {};   // zero-initialized accumulator
    const vx *B_ = (const vx *)B;

    // Accumulate kc rank-1 updates: P(i,:) += A[i] * B(l,:).
    for (Index l=0; l<kc; ++l) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR + j] += A[i]*B_[j];
            }
        }
        A += MR;
        B_ += NR;
    }

    // Write back, viewing P as scalars: each row of P_ is NR*div scalars wide.
    // beta==0 must not read C (it may be uninitialized; 0*NaN != 0).
    T *P_ = (T *)P;
    if (beta==T(0)) {
        for (Index j=0; j<NR*Index(div); ++j) {
            for (Index i=0; i<MR; ++i) {
                C[i*incRowC+j*incColC] = alpha*P_[i*NR*div + j];
            }
        }
    } else {
        for (Index j=0; j<NR*Index(div); ++j) {
            for (Index i=0; i<MR; ++i) {
                C[i*incRowC+j*incColC] *= beta;
                C[i*incRowC+j*incColC] += alpha*P_[i*NR*div + j];
            }
        }
    }
}