1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
|
#define __x86_64__
// Workaround: defining __x86_64__ around this include is only necessary when using MinGW; gcc 4.4 can do without it.
#include <mmintrin.h>
#undef __x86_64__
...
for (int y=top;y<bottom;y++)
{
uint64_t* ll_screen=...;
uint64_t* ll_bitmap=...;
uint64_t* ll_alphamap=...;
const __m64 zero=_mm_set_pi64x(0);
const __m64 thfs=_mm_set_pi16(0,256,256,256);
const uint64_t tc64=static_cast<uint64_t(transparentColor)<<32|static_cast<uint64_t>(transparentColor);
uint64_t cpix;
for (int x=left;x<right;x+=2)
{
cpix=*ll_bitmap++;
if (cpix!=tc64)
{
__m64 bmpalpha=_mm_set_pi64x(*(ll_alphamap++));
bmpalpha=_mm_unpacklo_pi8(bmpalpha,zero);
__m64 scralpha=_mm_sub_pi16(thfs,bmpalpha);
__m64 src=_mm_set_pi64x(cpix);
__m64 dest=_mm_set_pi64x(*ll_screen);
__m64 px1=_mm_unpacklo_pi8(src,zero);
__m64 px2=_mm_unpackhi_pi8(src,zero);
px1=_mm_mullo_pi16(px1,bmpalpha);
px2=_mm_mullo_pi16(px2,bmpalpha);
__m64 dpx1=_mm_unpacklo_pi8(dest,zero);
__m64 dpx2=_mm_unpackhi_pi8(dest,zero);
dpx1=_mm_mullo_pi16(dpx1,scralpha);
dpx2=_mm_mullo_pi16(dpx2,scralpha);
px1=_mm_add_pi16(px1,dpx1);
px2=_mm_add_pi16(px2,dpx2);
px1=_mm_srli_pi16(px1,8);
px2=_mm_srli_pi16(px2,8);
__m64 res=_mm_packs_pu16(px1,px2);
*(ll_screen++)=_mm_cvtsi64_si64x(res);
}
else ++ll_screen,++ll_alphamap;
}
}
|