#include <stdio.h>
/* Matrix A: 4x4 16-bit integers, one matrix row per source line. */
short s1[16] = {
     1,  2,  3,  4,   /* row 0 */
     5,  6,  7,  8,   /* row 1 */
     9, 10, 11, 12,   /* row 2 */
    13, 14, 15, 16    /* row 3 */
};
/* Matrix B stored transposed: row r below holds column r of B, so B itself
   reads 17..32 in row-major order. */
short s2[16] = {
    17, 21, 25, 29,   /* column 0 of B */
    18, 22, 26, 30,   /* column 1 of B */
    19, 23, 27, 31,   /* column 2 of B */
    20, 24, 28, 32    /* column 3 of B */
};
/* Destination matrix: main stores the computed product values here and
   prints it as 4 rows of 4 (element index = row*4 + column). */
short d[16];
/* File-scope loop counters used by main's print loop: j = row, i = column. */
int j;
int i;
/* The matrices defined at the top of this file, redeclared here so the code
   below names every file-scope object it touches. */
extern short s1[16];
extern short s2[16];
extern short d[16];

/*
 * Compute one row of the product A*B with MMX.
 *
 *   a_row   : 4 consecutive 16-bit elements of one row of A
 *   bt      : B transposed, 4x4 row-major (row c of bt is column c of B)
 *   out_row : receives 4 results; out_row[c] = sum over k of a_row[k]*bt[c*4+k]
 *
 * Products are summed as 32-bit values and then narrowed to 16 bits with
 * signed saturation (packssdw). The byte offsets 8/16/24 assume a 2-byte
 * short, which holds on x86.
 *
 * Everything lives in a single extended-asm statement so the compiler knows
 * which registers and memory are touched and cannot schedule its own code
 * between the instructions. The trailing emms clears the MMX state so later
 * x87 floating-point code is safe.
 */
static void mmx_row_times_bt( const short *a_row, const short *bt, short *out_row ){
    __asm__ volatile(
        /* Split the A row into its two word pairs, each duplicated. */
        "movq      (%[a]), %%mm6      \n\t"   /* mm6 = a0 a1 a2 a3           */
        "movq      %%mm6, %%mm7       \n\t"
        "punpckhdq %%mm6, %%mm6       \n\t"   /* mm6 = a2 a3 a2 a3           */
        "punpckldq %%mm7, %%mm7       \n\t"   /* mm7 = a0 a1 a0 a1           */

        /* Result columns 0 and 1: rows 0 and 1 of bt. */
        "movq      (%[bt]), %%mm0     \n\t"   /* mm0 = bt row 0              */
        "movq      8(%[bt]), %%mm2    \n\t"   /* mm2 = bt row 1              */
        "movq      %%mm0, %%mm1       \n\t"
        "punpckhdq %%mm2, %%mm0       \n\t"   /* mm0 = r0[2] r0[3] r1[2] r1[3] */
        "punpckldq %%mm2, %%mm1       \n\t"   /* mm1 = r0[0] r0[1] r1[0] r1[1] */
        "pmaddwd   %%mm6, %%mm0       \n\t"   /* high-pair partial sums      */
        "pmaddwd   %%mm7, %%mm1       \n\t"   /* low-pair partial sums       */
        "paddd     %%mm1, %%mm0       \n\t"   /* mm0 = { a.r0, a.r1 } dwords */

        /* Result columns 2 and 3: rows 2 and 3 of bt. */
        "movq      16(%[bt]), %%mm2   \n\t"   /* mm2 = bt row 2              */
        "movq      24(%[bt]), %%mm4   \n\t"   /* mm4 = bt row 3              */
        "movq      %%mm2, %%mm3       \n\t"
        "punpckhdq %%mm4, %%mm2       \n\t"
        "punpckldq %%mm4, %%mm3       \n\t"
        "pmaddwd   %%mm6, %%mm2       \n\t"
        "pmaddwd   %%mm7, %%mm3       \n\t"
        "paddd     %%mm3, %%mm2       \n\t"   /* mm2 = { a.r2, a.r3 } dwords */

        /* Narrow the four 32-bit sums to 16 bits and store the row. */
        "packssdw  %%mm2, %%mm0       \n\t"
        "movq      %%mm0, (%[out])    \n\t"
        "emms"
        :
        : [a] "r" (a_row), [bt] "r" (bt), [out] "r" (out_row)
        : "memory", "mm0", "mm1", "mm2", "mm3", "mm4", "mm6", "mm7"
    );
}

/*
 * Multiply A (s1) by B (given transposed in s2) into d, one row at a time,
 * then print d to stderr as four tab-separated rows.
 * The command-line arguments are not used. Always returns 0.
 */
int main( int argc, char** argv ){
    (void)argc;
    (void)argv;

    /* Fill every row of d, so the print loop below shows the full product. */
    for( j = 0 ; j < 4 ; j++ ){
        mmx_row_times_bt( &s1[j*4], s2, &d[j*4] );
    }

    for( j = 0 ; j < 4 ; j++ ){
        for( i = 0 ; i < 4 ; i++ ){
            fprintf( stderr, "\t%3d", d[j*4+i] );
        }
        fprintf( stderr, "\n" );
    }
    return 0;
}