#include <stdio.h>
/* Number of float lanes packed into one vector. */
#define VECTOR_SIZE 4
/*
 * GCC vector-extension type: VECTOR_SIZE floats treated as a single value of
 * sizeof(float)*VECTOR_SIZE bytes, so arithmetic operators act on all lanes.
 */
typedef float v4sf __attribute__ ((vector_size(sizeof(float)*VECTOR_SIZE))); // vector of four single floats
/*
 * Overlay of one vector value with a plain float array: the two members
 * share the same storage, so a value written through `v` can be read back
 * lane by lane through `f`.
 */
union f4vector
{
    v4sf  v;               /* whole vector, used for vector arithmetic */
    float f[VECTOR_SIZE];  /* the same storage viewed as individual floats */
};
typedef union f4vector f4vector;
void print_vector (f4vector *v)
{
printf("%f,%f,%f,%f\n", v->f[0], v->f[1], v->f[2], v->f[3]);
}
int main()
{
union f4vector a, b, c;
a.v = (v4sf){1., 2., 3., 4.};
b.v = (v4sf){5., 6., 7., 8.};
c.v = a.v + b.v;
print_vector(&a);
print_vector(&b);
print_vector(&c);
}
Compile with the following command:
gcc -ggdb -mtune=pentium3 -march=pentium3 -c -O3 -ffast-math -mfpmath=sse -msse5 sse.c
To test, link the object file into an executable:
gcc -lm sse.o -o sse
$ ./sse
1.000000,2.000000,3.000000,4.000000
5.000000,6.000000,7.000000,8.000000
6.000000,8.000000,10.000000,12.000000
The disassembled code:
$ objdump -dS ./sse.o | grep -2 c.v | tail -8
7c: 0f 58 c1 addps %xmm1,%xmm0
7f: 0f 29 45 c8 movaps %xmm0,-0x38(%ebp)
--
120: f2 0f 11 44 24 04 movsd %xmm0,0x4(%esp)
126: e8 00 00 00 00 call 12b <_main+0xdb>
c.v = a.v + b.v;
print_vector(&a);
As we can see, the code is well optimized: all four components of vectors a and b are added by a single SSE instruction (addps), rather than by the multiple instructions that would be generated without -msse and -mfpmath=sse.
How fast is the program?
$ time ./sse
1.000000,2.000000,3.000000,4.000000
5.000000,6.000000,7.000000,8.000000
6.000000,8.000000,10.000000,12.000000
real 0m0.109s
user 0m0.046s
sys 0m0.030s
No comments:
Post a Comment