(So the pages are re-done …)
The bug is that NR is BlockSize<T>::NR divided by the vector length, so the
following should work:
//---------------------------------------------------------------------------------
// GEMM micro-kernel: C <- beta*C + alpha*A*B for one MR x NR panel.
//
// A: packed MR x kc panel (column-major, MR scalars per k-step).
// B: packed kc x NR panel (row-major, NR scalars per k-step).
// C: MR x NR target panel addressed via incRowC/incColC strides.
//
// Implemented with GCC vector extensions: 256-bit vectors of `div` scalars,
// so the accumulator P holds MR x (NR/div) vectors = MR x NR scalars.
// NOTE(review): assumes BlockSize<T>::NR is a multiple of 32/sizeof(T) —
// confirm against the BlockSize definition.
template <typename Index, typename T>
typename std::enable_if<std::is_floating_point<T>::value,
         void>::type
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    // The packing routines guarantee 128-byte alignment; tell the compiler.
    A = (const T*) __builtin_assume_aligned (A, 128);
    B = (const T*) __builtin_assume_aligned (B, 128);
    static const unsigned div = 32/sizeof(T);       // scalars per 256-bit vector
    static const Index MR = BlockSize<T>::MR;
    static const Index NR = BlockSize<T>::NR/div;   // NR counted in *vectors*

    typedef T vx __attribute__((vector_size (32)));

    vx P[MR*NR] __attribute__ ((aligned (128))) = {};   // zero-initialized accumulator
    const vx *B_ = (const vx *)B;

    // Accumulate kc rank-1 updates: P(i,:) += A[i] * B(l,:).
    for (Index l=0; l<kc; ++l) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR + j] += A[i]*B_[j];
            }
        }
        A += MR;
        B_ += NR;
    }

    // Write back, viewing P as scalars: each row of P is NR*div scalars wide.
    // beta==0 is a special case: C must not be read (it may be uninitialized
    // or contain NaN/Inf, and 0*NaN != 0), so overwrite instead of scaling.
    T *P_ = (T *)P;
    if (beta==T(0)) {
        for (Index j=0; j<NR*Index(div); ++j) {
            for (Index i=0; i<MR; ++i) {
                C[i*incRowC+j*incColC] = alpha*P_[i*NR*div + j];
            }
        }
    } else {
        for (Index j=0; j<NR*Index(div); ++j) {
            for (Index i=0; i<MR; ++i) {
                C[i*incRowC+j*incColC] *= beta;
                C[i*incRowC+j*incColC] += alpha*P_[i*NR*div + j];
            }
        }
    }
}
//---------------------------------------------------------------------------------
However, as Karl pointed out, it should also treat the case beta==0
as a special case (C must not be read then, since 0*NaN is NaN).
But I don’t get any speedup. The part that matters for performance is
mainly this loop:
for (Index l=0; l<kc; ++l) {
for (Index i=0; i<MR; ++i) {
for (Index j=0; j<(NR); ++j) {
P[i * NR + j] += A[i]*B_[j];
}
}
A += MR;
B_ += NR;
}
In my experience, only once this loop is optimized to the extreme can a
little extra performance be gained by treating some special cases in
the update:
T *P_ = (T *)P;
for (Index j=0; j<NR*div; ++j) {
for (Index i=0; i<MR; ++i) {
C[i*incRowC+j*incColC] *= beta;
C[i*incRowC+j*incColC] += alpha*P_[i * NR * div + j];
}
}
The main reason my code looks weird is that I also want to use it
for teaching. The way it is coded, you can easily map it back and
forth to the assembly code.
new kernel:
// GEMM micro-kernel: C <- beta*C + alpha*A*B for one MR x NR panel,
// using GCC vector extensions (256-bit vectors of `div` scalars).
//
// BUG FIX in the write-back loop: P holds MR*NR *vectors*, i.e. each row of
// the scalar view P_ is NR*div scalars wide and C is MR x (NR*div). The
// original loop ran j<NR with P_[i*NR+j], so only the first NR columns of C
// were updated and the row stride into P_ was wrong. Restored the correct
// bounds/indexing (as in the earlier version) and added the beta==0 case.
template <typename Index, typename T>
typename std::enable_if<std::is_floating_point<T>::value,
         void>::type
ugemm(Index kc, T alpha,
      const T *A, const T *B,
      T beta,
      T *C, Index incRowC, Index incColC)
{
    // Packed panels are 128-byte aligned; tell the compiler.
    A = (const T*) __builtin_assume_aligned (A, 128);
    B = (const T*) __builtin_assume_aligned (B, 128);
    static const unsigned div = 32/sizeof(T);       // scalars per 256-bit vector
    static const Index MR = BlockSize<T>::MR;
    static const Index NR = BlockSize<T>::NR/div;   // NR counted in *vectors*

    typedef T vx __attribute__((vector_size (32)));

    vx P[MR*NR] __attribute__ ((aligned (128))) = {};   // zero-initialized accumulator
    const vx *B_ = (const vx *)B;

    // Accumulate kc rank-1 updates: P(i,:) += A[i] * B(l,:).
    for (Index l=0; l<kc; ++l) {
        for (Index i=0; i<MR; ++i) {
            for (Index j=0; j<NR; ++j) {
                P[i*NR + j] += A[i]*B_[j];
            }
        }
        A += MR;
        B_ += NR;
    }

    // Write back, viewing P as scalars: each row of P_ is NR*div scalars wide.
    // beta==0 must not read C (it may be uninitialized; 0*NaN != 0).
    T *P_ = (T *)P;
    if (beta==T(0)) {
        for (Index j=0; j<NR*Index(div); ++j) {
            for (Index i=0; i<MR; ++i) {
                C[i*incRowC+j*incColC] = alpha*P_[i*NR*div + j];
            }
        }
    } else {
        for (Index j=0; j<NR*Index(div); ++j) {
            for (Index i=0; i<MR; ++i) {
                C[i*incRowC+j*incColC] *= beta;
                C[i*incRowC+j*incColC] += alpha*P_[i*NR*div + j];
            }
        }
    }
}