multiply – 내 맘대로 보는 세상

diagram for multiplying matrix with mmx
대강 생각을 해보니 정말 mmx 를 이용해서 빠르게 연산을 하려면 위와 같이 하는게 가장 빠르겠군요. 다만 레지스터를 많이 쓰고 완전히 asm 코딩을 해야한다는 게 조금 귀찮겠군요. 😉
위의 다이아그램에 있는 과정을 통해 4×4 matrix * 4×4 matrix 의 한 row 씩을 계산해낼 수 있습니다. 대강 계산했을 때 3배 이상의 속도 향상이 있을거라고 예상되던데 과연~

#include <stdio.h>

// A matrix: 4x4, four consecutive shorts per row.
short s1[16] = {
     1,  2,  3,  4,
     5,  6,  7,  8,
     9, 10, 11, 12,
    13, 14, 15, 16,
};
// Transpose(B matrix): each group of four shorts is one column of B,
// so a column can be fetched with a single 8-byte load.
short s2[16] = {
    17, 21, 25, 29,
    18, 22, 26, 30,
    19, 23, 27, 31,
    20, 24, 28, 32
};
// Destination matrix (zero-initialised as a file-scope object).
short d[16];
// File-scope loop counters.
int j, i;
int main( int argc, char** argv ){
    __asm__("movq (s1), %mm0" );
    __asm__("movq %mm0, %mm1" );
    __asm__("movq %mm0, %mm2" );
    __asm__("punpckhdq %mm2, %mm0" );
    __asm__("punpckldq %mm2, %mm1" );
    __asm__("movq %mm0, %mm6");
    __asm__("movq %mm1, %mm7");
    __asm__("movq (s2), %mm2" );
    __asm__("mov $1, %eax" );
    __asm__("movq s2(,%eax,8), %mm4");
    __asm__("movq %mm2, %mm3" );
    __asm__("punpckhdq %mm4, %mm2");
    __asm__("punpckldq %mm4, %mm3");
    __asm__("pmaddwd %mm2, %mm0");
    __asm__("pmaddwd %mm3, %mm1");
    __asm__("paddw  %mm1, %mm0");
    __asm__("movq %mm6, %mm1");
    __asm__("movq %mm7, %mm2");
    __asm__("mov $2, %eax" );
    __asm__("movq s2(,%eax,8), %mm3" );
    __asm__("mov $3, %eax" );
    __asm__("movq s2(,%eax,8), %mm5");
    __asm__("movq %mm3, %mm4" );
    __asm__("punpckhdq %mm5, %mm3");
    __asm__("punpckldq %mm5, %mm4");
    __asm__("pmaddwd %mm3, %mm1");
    __asm__("pmaddwd %mm4, %mm2");
    __asm__("paddw %mm2, %mm1");
    __asm__("packssdw %mm1, %mm0");
    __asm__("movq %mm0, (d)");
    for( j = 0 ; j < 4 ; j++ ){
        for( i = 0 ; i < 4 ; i++ ){
            fprintf( stderr, "\t%3d", d[j*4+i] );
        }
        fprintf( stderr, "\n" );
    }
    return 0;
}

#include <stdio.h>

// A matrix: 4x4, four consecutive shorts per row.
short s1[16] = {
     1,  2,  3,  4,
     5,  6,  7,  8,
     9, 10, 11, 12,
    13, 14, 15, 16,
};

// Transpose(B matrix): each group of four shorts is one column of B,
// so a column can be fetched with a single 8-byte load.
short s2[16] = {
    17, 21, 25, 29,
    18, 22, 26, 30,
    19, 23, 27, 31,
    20, 24, 28, 32
};

// Destination matrix (zero-initialised as a file-scope object).
short d[16];

// File-scope loop counters.
int j, i;

int main( int argc, char** argv ){

__asm__("movq (s1), %mm0" );

__asm__("movq %mm0, %mm1" );

__asm__("movq %mm0, %mm2" );

__asm__("punpckhdq %mm2, %mm0" );

__asm__("punpckldq %mm2, %mm1" );

__asm__("movq %mm0, %mm6");

__asm__("movq %mm1, %mm7");

__asm__("movq (s2), %mm2" );

__asm__("mov $1, %eax" );

__asm__("movq s2(,%eax,8), %mm4");

__asm__("movq %mm2, %mm3" );

__asm__("punpckhdq %mm4, %mm2");

__asm__("punpckldq %mm4, %mm3");

__asm__("pmaddwd %mm2, %mm0");

__asm__("pmaddwd %mm3, %mm1");

__asm__("paddw %mm1, %mm0");

__asm__("movq %mm6, %mm1");

__asm__("movq %mm7, %mm2");

__asm__("mov $2, %eax" );

__asm__("movq s2(,%eax,8), %mm3" );

__asm__("mov $3, %eax" );

__asm__("movq s2(,%eax,8), %mm5");

__asm__("movq %mm3, %mm4" );

__asm__("punpckhdq %mm5, %mm3");

__asm__("punpckldq %mm5, %mm4");

__asm__("pmaddwd %mm3, %mm1");

__asm__("pmaddwd %mm4, %mm2");

__asm__("paddw %mm2, %mm1");

__asm__("packssdw %mm1, %mm0");

__asm__("movq %mm0, (d)");

for( j = 0 ; j < 4 ; j++ ){

for( i = 0 ; i < 4 ; i++ ){

fprintf( stderr, "\t%3d", d[j*4+i] );

}

fprintf( stderr, "\n" );

}

return 0;

}

코드로 옮기니 위와 같군요. 중간에 실수로 바이트오더를 헷갈려서 연산 결과가 뒤집혔었습니다. 정상적인 결과는 250 260 270 280 이 나와야 하는데 280 270 260 250 이 나와버리더군요. 아아 이거 다시 하고 싶은 작업이 아니네요;
흐흣 그래도 오랜만에 어셈블리 관련된 것들을 생각하고 있는데, 이것도 가끔 하니까 재밌네요. 근데 길어지면 할 만하지 않다는 거 -_-!
p.s) 전체 연산 코드를 보고 싶으시면 http://mytears.org/resources/mysrc/c/mmx.c 를 보시길 😉

며칠 전에 썼던 글에서 테스트를 해본 내용을 바탕으로 4×4 matrix multiply 연산을 mmx 를 이용해서 구현해봤습니다.

#include <stdio.h>

// A matrix: 4x4, four consecutive shorts per row.
short s1[16] = {
     1,  2,  3,  4,
     5,  6,  7,  8,
     9, 10, 11, 12,
    13, 14, 15, 16,
};
// Transpose(B matrix): each group of four shorts is one column of B.
short s2[16] = {
    17, 21, 25, 29,
    18, 22, 26, 30,
    19, 23, 27, 31,
    20, 24, 28, 32
};
// Destination matrix (zero-initialised as a file-scope object).
short d[16];
// Four-element scratch buffer.
short t[4];
// File-scope loop counters.
int i, j;
// NOTE(review): presumably timestamps for benchmarking -- TODO confirm.
long start, end;
/*
 * Plain-C reference: fills d with the 4x4 product of s1 and the matrix
 * whose transpose is stored in s2, then prints d to stderr under a
 * "c version" heading, one tab-separated row per line.
 *
 * Returns 0.
 */
int main( int argc, char** argv ){
    int k;

    j = 0;
    while( j < 4 ){
        const short *lhs_row = s1 + j*4;
        short *out_row = d + j*4;

        for( i = 0 ; i < 4 ; ++i ){
            /* s2 is stored transposed, so a column is read as a row. */
            const short *rhs_col = s2 + i*4;

            out_row[i] = 0;
            k = 0;
            while( k < 4 ){
                out_row[i] += lhs_row[k] * rhs_col[k];
                ++k;
            }
        }
        ++j;
    }

    fprintf( stderr, "c version\n\n" );

    for( j = 0 ; j < 4 ; ++j ){
        const short *row = d + j*4;

        for( i = 0 ; i < 4 ; ++i ){
            fprintf( stderr, "\t%3d", row[i] );
        }
        fprintf( stderr, "\n" );
    }
    return 0;
}

#include <stdio.h>

// A matrix: 4x4, four consecutive shorts per row.
short s1[16] = {
     1,  2,  3,  4,
     5,  6,  7,  8,
     9, 10, 11, 12,
    13, 14, 15, 16,
};

// Transpose(B matrix): each group of four shorts is one column of B.
short s2[16] = {
    17, 21, 25, 29,
    18, 22, 26, 30,
    19, 23, 27, 31,
    20, 24, 28, 32
};

// Destination matrix (zero-initialised as a file-scope object).
short d[16];

// Four-element scratch buffer.
short t[4];

// File-scope loop counters.
int i, j;

// NOTE(review): presumably timestamps for benchmarking -- TODO confirm.
long start, end;

/*
 * Plain-C reference: fills d with the 4x4 product of s1 and the matrix
 * whose transpose is stored in s2, then prints d to stderr under a
 * "c version" heading, one tab-separated row per line.
 *
 * Returns 0.
 */
int main( int argc, char** argv ){
    int k;

    (void)argc;
    (void)argv;

    for( j = 0 ; j < 4 ; j++ ){
        for( i = 0 ; i < 4 ; i++ ){
            d[j*4+i] = 0;
            /* s2 is stored transposed, so column i of B is read at s2[i*4 .. i*4+3]. */
            for( k = 0 ; k < 4 ; k++ ){
                d[j*4+i] += s1[j*4+k] * s2[i*4+k];
            }
        }
    }

    fprintf( stderr, "c version\n\n" );

    for( j = 0 ; j < 4 ; j++ ){
        for( i = 0 ; i < 4 ; i++ ){
            fprintf( stderr, "\t%3d", d[j*4+i] );
        }
        fprintf( stderr, "\n" );
    }

    return 0;
}

위와 같은 c version 의 코드를 작성한 후 아래와 같은 asm version 으로 컨버팅을 해봤는데, 100000 번 반복해서 연산을 하도록 해본 결과 mmx 버전이 c 버전보다 3배 정도 빠르게 연산을 하는 것을 확인할 수 있었습니다. (-O0 옵션과 함께 컴파일 했을 경우)
하지만 -O3 옵션과 함께 컴파일하게 되면 asm 버전은 무한 루프에 빠진 듯한 모습을 보여줬고, c 버전의 수행속도가 -O0 로 컴파일한 asm 버전보다 빠른 현상이 발생했습니다. 이유는 알 수 없음 -_-;

#include <stdio.h>
// Nothing in this listing references <asm/mmx.h>; it is only pulled in
// when the toolchain actually provides it.
// NOTE(review): believed to be a kernel-side header on Linux -- confirm.
#if defined(__has_include)
#  if __has_include(<asm/mmx.h>)
#    include <asm/mmx.h>
#  endif
#endif

// A matrix: 4x4, four consecutive shorts per row.
short s1[16] = {
     1,  2,  3,  4,
     5,  6,  7,  8,
     9, 10, 11, 12,
    13, 14, 15, 16,
};
// Transpose(B matrix): each group of four shorts is one column of B,
// so a column can be fetched with a single 8-byte load.
short s2[16] = {
    17, 21, 25, 29,
    18, 22, 26, 30,
    19, 23, 27, 31,
    20, 24, 28, 32
};
// Destination matrix (zero-initialised as a file-scope object).
short d[16];
// Receives the four lane-wise products of one row/column pair.
short t[4];
// File-scope loop counters.
int i, j;
int main( int argc, char** argv ){
    int loop;
    for( loop = 0 ; loop < 10000; loop++ ){
        for( j = 0 ; j < 4 ; j++ ){
            for( i = 0 ; i < 4 ; i++ ){
                __asm__("mov j, %eax");
                __asm__("movq s1(,%eax,8), %mm0" );
                __asm__("mov i, %eax");
                __asm__("movq s2(,%eax,8), %mm1" );
                __asm__("pmullw %mm1, %mm0");
                __asm__("movq %mm0, (t)" );
                d[j*4+i] = t[0] + t[1] + t[2] + t[3];
            }
        }
    }
    for( j = 0 ; j < 4 ; j++ ){
        for( i = 0 ; i < 4 ; i++ ){
            fprintf( stderr, "\t%3d", d[j*4+i] );
        }
        fprintf( stderr, "\n" );
    }
    return 0;
}

#include <stdio.h>

// Nothing in this listing references <asm/mmx.h>; it is only pulled in
// when the toolchain actually provides it.
// NOTE(review): believed to be a kernel-side header on Linux -- confirm.
#if defined(__has_include)
#  if __has_include(<asm/mmx.h>)
#    include <asm/mmx.h>
#  endif
#endif

// A matrix: 4x4, four consecutive shorts per row.
short s1[16] = {
     1,  2,  3,  4,
     5,  6,  7,  8,
     9, 10, 11, 12,
    13, 14, 15, 16,
};

// Transpose(B matrix): each group of four shorts is one column of B,
// so a column can be fetched with a single 8-byte load.
short s2[16] = {
    17, 21, 25, 29,
    18, 22, 26, 30,
    19, 23, 27, 31,
    20, 24, 28, 32
};

// Destination matrix (zero-initialised as a file-scope object).
short d[16];

// Receives the four lane-wise products of one row/column pair.
short t[4];

// File-scope loop counters.
int i, j;

int main( int argc, char** argv ){

int loop;

for( loop = 0 ; loop < 10000; loop++ ){

for( j = 0 ; j < 4 ; j++ ){

for( i = 0 ; i < 4 ; i++ ){

__asm__("mov j, %eax");

__asm__("movq s1(,%eax,8), %mm0" );

__asm__("mov i, %eax");

__asm__("movq s2(,%eax,8), %mm1" );

__asm__("pmullw %mm1, %mm0");

__asm__("movq %mm0, (t)" );

d[j*4+i] = t[0] + t[1] + t[2] + t[3];

}

for( j = 0 ; j < 4 ; j++ ){

for( i = 0 ; i < 4 ; i++ ){

fprintf( stderr, "\t%3d", d[j*4+i] );

}

fprintf( stderr, "\n" );

}

return 0;

}

8×8 matrix 는 뭔가 좀 더 생각해야할 것 같으니 나중에 정말 필요한 일 있을 때 구현을 해봐야겠습니다. -_-;
inline asm 작업을 하면서 eax 레지스터 값을 백업하지 않고 저렇게 사용해도 되는지는 잘 모르겠지만 -_-;; 하여튼 저 코드에 한해서는 별 문제 없으니 패스~ 꺄홋!!

Tag: multiply

matrix multiply with mmx #2

matrix multiply with mmx #1