// If you have any problems, contact me.
// The plain asm function is nearly as fast as the MMX version!
/////// asm code
// Saturation lookup table built by init_clip().  Once built, clip[i] is valid
// for -1024 <= i <= 1023 and holds i clamped to the range 0..255.  The pointer
// is biased 1024 bytes into a 2048-byte allocation so that negative indices
// can be used directly.  It stays NULL until init_clip() has succeeded.
unsigned char *clip;

// Build the clip[] table.
// Safe to call more than once: later calls return immediately instead of
// allocating (and leaking) another table.  If the allocation fails, clip is
// left NULL so the failure can be detected by the caller.
void init_clip()
{
    if (clip)
    {
        return; // table already exists
    }
    unsigned char *base=(unsigned char*)malloc(2048);
    if (!base)
    {
        return; // out of memory: never publish a pointer derived from NULL
    }
    for (int i=0;i<2048;i++)
    {
        int value=i-1024; // logical index of table slot i
        base[i]=(unsigned char)((value<0)?0:((value>255)?255:value));
    }
    clip=base+1024; // publish only after the table is completely filled
}
// Conversion equations implemented by paroll_yuv2rgb():
// r=1.164(y-16)+1.596(v-128)
// g=1.164(y-16)-0.391(u-128)-0.813(v-128)
// b=1.164(y-16)+2.018(u-128)
// The five coefficients below are the factors above multiplied by 64 and
// rounded (6-bit fixed point); paroll_yuv2rgb() removes the scale again with
// "sar ...,6" before the clip[] lookup.
const static int p_1164 = 75;
const static int p_1596 = 102;
const static int p_0391 = 25;
const static int p_0813 = 52;
const static int p_2018 = 129;
// Byte masks used to split four packed luma bytes into two dwords that each
// hold two samples in 16-bit lanes: ooffooff keeps bytes 0 and 2, ffooffoo
// keeps bytes 1 and 3.
const static int ooffooff=0x00ff00ff;
const static int ffooffoo=0xff00ff00;
// Constant terms of the expanded equations (-223 for r, +135.9 for g,
// -276.93 for b).  They are scaled by 32 rather than 64 because
// paroll_yuv2rgb() adds them to a sum that was halved with "shr ...,1" and
// doubles the result afterwards with "shl ...,1".  The negative terms are
// stored modulo 32768: 25632 = 32768 - 223*32, 23906 = 32768 - 8862
// (276.93*32 rounded), while 4349 is 135.9*32 rounded.  Each value appears
// twice so that a single dword load covers both 16-bit lanes.
const static short p_223[]= {25632,25632};
const static short p_135[]= {4349,4349};
const static short p_277[]= { 23906,23906};
void paroll_yuv2rgb(unsigned char *y,
unsigned char *u,
unsigned char *v,
unsigned char *r,
int h ,
int w)
{
// h: height of y matrix
// w: width of y matrix
// chroma type:: must be 420
// r=1.164*y + 1.596*v -223
// g=1.164*y - 0.391*u - 0.813*v +135.9
// b=1.164*y + 2.018*u -276.93
int py1164_20;
int py1164_31;
int pv1596;
int pv0813;
int pu0391;
int pu2018;
int pr20,pr31,pg20,pg31,pb20,pb31;
int rw=w<<2;
int rws16=rw-16;
int lw=w>>2;
int lh=h>>1;
int lw0=lw;
int iclip=(int)clip;
__asm
{
mov esi,y
llw:
mov edi,v
add [v],2
movzx ebx,byte ptr [edi]
movzx eax,byte ptr [edi+1]
mov edi,u
add [u],2
shl eax,16
or eax,ebx // 00 v1 00 v0
movzx ecx,byte ptr [edi+1]
mov ebx,eax
mul [p_0813]
shl ecx,16
mov [pv0813],eax
mov eax,ebx
mul dword ptr p_1596
movzx ebx,byte ptr [edi]
mov [pv1596],eax
mov eax,ecx
or eax,ebx // 00 u1 00 u0
mov ecx,[esi] // y3 y2 y1 y0
mov ebx,eax
mul dword ptr p_0391
mov edi,ecx
mov [pu0391],eax
mov eax,ebx
mul dword ptr p_2018
and ecx,ooffooff // 0 y2 0 y0
mov [pu2018],eax
mov eax,ecx
mul [p_1164] // y2 y0
and edi,ffooffoo // y3 0 y1 0
mov [py1164_20],eax
mov eax,edi
shr eax,8 // 0 y3 0 y2
mul [p_1164] // y3 y1
mov ecx,[pv1596]
mov ebx,[py1164_20]
mov edx,dword ptr p_223
mov [py1164_31],eax
add eax,ecx
add ebx,ecx
shr eax,1
shr ebx,1
add eax,edx
add ebx,edx
shl eax,1
shl ebx,1
mov [pr31],eax // r3 r1
mov [pr20],ebx // r2 r0
mov ecx,[pu2018]
mov eax,[py1164_20]
mov ebx,[py1164_31]
mov edx,dword ptr [p_277]
add eax,ecx
add ebx,ecx
shr eax,1
shr ebx,1
add eax,edx
add ebx,edx
shl eax,1
shl ebx,1
mov [pb20],eax
mov [pb31],ebx
mov eax,[py1164_20]
mov ebx,[py1164_31]
mov ecx,[pu0391]
mov edx,[pv0813]
shr eax,1
shr ebx,1
shr ecx,1
shr edx,1
sub eax,ecx
sub ebx,ecx
mov ecx,dword ptr [p_135]
sub eax,edx
sub ebx,edx
add eax,ecx
add ebx,ecx
shl eax,1
shl ebx,1
mov [pg20],eax
mov [pg31],ebx
// clip and output
mov edi,r
lea edx, [pr20]
mov ecx,iclip
movsx eax,word ptr [edx]
movsx ebx,word ptr [edx+2]
sar eax,6
sar ebx,6
add eax,ecx
add ebx,ecx
xor ecx,ecx
xor edx,edx
mov cl,[eax] // r0
mov dl,[ebx] // r2
mov [edi+2],cl
mov [edi+10],dl
lea edx,[pr31]
mov ecx,iclip
movsx eax,word ptr [edx]
movsx ebx,word ptr [edx+2]
sar eax,6
sar ebx,6
add eax,ecx
add ebx,ecx
xor ecx,ecx
xor edx,edx
mov cl,[eax] // r1
mov dl,[ebx] // r3
mov [edi+6],cl
mov [edi+14],dl
lea edx,[pg20]
mov ecx,iclip
movsx eax,word ptr [edx]
movsx ebx,word ptr [edx+2]
sar eax,6
sar ebx,6
add eax,ecx
add ebx,ecx
xor ecx,ecx
xor edx,edx
mov cl,[eax] // g0
mov dl,[ebx] // g2
mov [edi+1],cl
mov [edi+9],dl
lea edx,[pg31]
mov ecx,iclip
movsx eax,word ptr [edx]
movsx ebx,word ptr [edx+2]
sar eax,6
sar ebx,6
add eax,ecx
add ebx,ecx
xor ecx,ecx
xor edx,edx
mov cl,[eax] // g1
mov dl,[ebx] // g3
mov [edi+5],cl
mov [edi+13],dl
lea edx,[pb20]
mov ecx,iclip
movsx eax,word ptr [edx]
movsx ebx,word ptr [edx+2]
sar eax,6
sar ebx,6
add eax,ecx
add ebx,ecx
xor ecx,ecx
xor edx,edx
mov cl,[eax] // b0
mov dl,[ebx] // b2
mov [edi],cl
mov [edi+8],dl
lea edx,[pb31]
mov ecx,iclip
movsx eax,word ptr [edx]
movsx ebx,word ptr [edx+2]
sar eax,6
sar ebx,6
add eax,ecx
add ebx,ecx
xor ecx,ecx
xor edx,edx
mov cl,[eax] // b1
mov dl,[ebx] // b3
mov ebx,rw
add esi,w
add [r],ebx
mov eax,[esi] // y3 y2 y1 y0
mov [edi+4],cl
mov [edi+12],dl
// next row of y
mov ebx,eax
and eax,ooffooff // 0 y2 0 y0
mul [p_1164]
and ebx,ffooffoo // y3 0 y1 0
shr ebx,8
mov [py1164_20],eax
mov eax,ebx
mul [p_1164]
mov ecx,pv1596
mov ebx,py1164_20
mov edx,dword ptr p_223
mov [py1164_31],eax
add eax,ecx
add ebx,ecx
shr eax,1
shr ebx,1
add eax,edx
add ebx,edx
shl eax,1
shl ebx,1
mov [pr31],eax // r3 r1
mov [pr20],ebx // r2 r0
mov ecx,[pu2018]
mov eax,[py1164_20]
mov ebx,[py1164_31]
mov edx,dword ptr [p_277]
add eax,ecx
add ebx,ecx
shr eax,1
shr ebx,1
add eax,edx
add ebx,edx
shl eax,1
shl ebx,1
mov [pb20],eax
mov [pb31],ebx
mov ecx,[pu0391]
mov eax,[py1164_20]
mov ebx,[py1164_31]
mov edx,[pv0813]
shr ecx,1
shr eax,1
shr ebx,1
shr edx,1
sub eax,ecx
sub ebx,ecx
mov ecx,dword ptr [p_135]
sub eax,edx
sub ebx,edx
add eax,ecx
add ebx,ecx
shl eax,1
shl ebx,1
mov [pg20],eax
mov [pg31],ebx
// clip and output
mov edi,r
lea edx,[pr20]
mov ecx,iclip
movsx eax,word ptr [edx]
movsx ebx,word ptr [edx+2]
sar eax,6
sar ebx,6
add eax,ecx
add ebx,ecx
xor ecx,ecx
xor edx,edx
mov cl,[eax] // r0
mov dl,[ebx] // r2
mov [edi+2],cl
mov [edi+10],dl
lea edx,[pr31]
mov ecx,iclip
movsx eax,word ptr [edx]
movsx ebx,word ptr [edx+2]
sar eax,6
sar ebx,6
add eax,ecx
add ebx,ecx
xor ecx,ecx
xor edx,edx
mov cl,[eax] // r1
mov dl,[ebx] // r3
mov [edi+6],cl
mov [edi+14],dl
lea edx,[pg20]
mov ecx,iclip
movsx eax,word ptr [edx]
movsx ebx,word ptr [edx+2]
sar eax,6
sar ebx,6
add eax,ecx
add ebx,ecx
xor ecx,ecx
xor edx,edx
mov cl,[eax] // g0
mov dl,[ebx] // g2
mov [edi+1],cl
mov [edi+9],dl
lea edx,[pg31]
mov ecx,iclip
movsx eax,word ptr [edx]
movsx ebx,word ptr [edx+2]
sar eax,6
sar ebx,6
add eax,ecx
add ebx,ecx
xor ecx,ecx
xor edx,edx
mov cl,[eax] // g1
mov dl,[ebx] // g3
mov [edi+5],cl
mov [edi+13],dl
lea edx,[pb20]
mov ecx,iclip
movsx eax,word ptr [edx]
movsx ebx,word ptr [edx+2]
sar eax,6
sar ebx,6
add eax,ecx
add ebx,ecx
xor ecx,ecx
xor edx,edx
mov cl,[eax] // b0
mov dl,[ebx] // b2
mov [edi],cl
mov [edi+8],dl
lea edx,[pb31]
mov ecx,iclip
movsx eax,word ptr [edx] file://b1
movsx ebx,word ptr [edx+2] file://b3
sar eax,6
sar ebx,6
add eax,ecx
add ebx,ecx
xor ecx,ecx
xor edx,edx
mov cl,[eax] // b1
mov dl,[ebx] // b3
mov [edi+4],cl
mov [edi+12],dl
mov eax,rws16
sub esi,w
add esi,4
sub [r],eax
sub [lw],1
jnz llw
mov eax,lw0
mov ebx,rw
add esi,w
add [r],ebx
mov [lw],eax
sub [lh],1
jnz llw
}
}
///// asm code end
///// mmx code begin
#ifdef __yuv2rgb_mul32
// Constants for the __yuv2rgb_mul32 variant of yuv2rgb4XmmxC420().  Every
// table holds the same value in all four 16-bit lanes of one MMX register.
// t16 and t128 are the offsets subtracted from luma and chroma samples.
const static short t16[4]={16,16,16,16};
const static short t128[4]={128,128,128,128};
// Conversion coefficients multiplied by 4096 and rounded (12-bit fixed
// point); the function removes the scale with "psrad ...,12".
// 1.164 * 4096
const short t1164[4]=
{
4768,4768,4768,4768
};
// 1.596 * 4096
const short t1596[4]=
{
6538,6538,6538,6538
};
// 0.391 * 4096
const short t0391[4]=
{
1602,1602,1602,1602
};
// 0.813 * 4096
const short t0813[4]=
{
3330,3330,3330,3330
};
// 2.018 * 4096
const short t2018[4]=
{
8266,8266,8266,8266
};
// r=1.164(y-16)+1.596(v-128)
// g=1.164(y-16)-0.391(u-128)-0.813(v-128)
// b=1.164(y-16)+2.018(u-128)
// ___0rgb selects the output byte order inside yuv2rgb4XmmxC420(): when it is
// defined each pixel is stored as the bytes B,G,R,0, otherwise as R,G,B,0.
#define ___0rgb
// MMX conversion of a planar YUV 4:2:0 picture to 32-bit pixels, using full
// 32-bit products (pmullw/pmulhw pairs) with 12 fractional bits.
//
// lpY        : luma plane, nSrcWidth bytes per row
// lpU, lpV   : chroma planes; one byte of each is shared by a 2x2 block of
//              luma samples, and the bytes are consumed strictly sequentially
// lpRGB      : output, 4 bytes per pixel, nSrcWidth*4 bytes per row; the
//              fourth byte of every pixel is written as 0
// nSrcHeight : luma height; nSrcHeight>>1 row pairs are processed
// nSrcWidth  : luma width; nSrcWidth>>3 groups of eight pixels are processed
//              per row, so the width has to be a multiple of 8 (and the
//              height even) for the row addressing to stay in step
//
// Returns without writing anything when a pointer is NULL or when the
// picture is smaller than 8x2.  Requires an MMX-capable CPU and MSVC 32-bit
// inline assembly; "emms" is executed before returning.
void VideoPlayer::yuv2rgb4XmmxC420(unsigned char *lpY,
unsigned char *lpU,
unsigned char *lpV,
unsigned char *lpRGB,
int nSrcHeight ,
int nSrcWidth)
{
int rgbwidth=nSrcWidth<<2;// 32 bits 0rgb;
int nyw=nSrcWidth;
int col=nSrcWidth>>3;
int row=nSrcHeight>>1;
// 32-bit chroma products for the current group; "_10" holds the products of
// chroma samples 1 and 0, "_32" those of samples 3 and 2.
int t1596v_128_10[2];
int t1596v_128_32[2];
int t0813v_128_10[2];
int t0813v_128_32[2];
int t0391u_128_10[2];
int t0391u_128_32[2];
int t2018u_128_10[2];
int t2018u_128_32[2];
if (!lpY || !lpU || !lpV || !lpRGB)
{
return;
}
if (col<=0 || row<=0)
{
// "dec ecx / jnz" and "dec ebx / jnz" would wrap around on a zero count.
return;
}
__asm
{
mov esi,lpU
mov edi,lpV
mov eax,lpY
mov edx,lpRGB
mov ecx,col
mov ebx,row
rrr: pxor mm0,mm0
movq mm3,qword ptr t128
movd mm2,dword ptr [edi] // 00 00 00 00 v3 v2 v1 v0
movd mm1,dword ptr [esi] // 00 00 00 00 u3 u2 u1 u0
punpcklbw mm2,mm0 // 00 v3 00 v2 00 v1 00 v0
punpcklbw mm1,mm0 // 00 u3 00 u2 00 u1 00 u0
psubsw mm1,mm3 // u-128
psubsw mm2,mm3 // v-128
// compute u,v data
// t0391u_128
movq mm7,qword ptr t0391
movq mm3,mm1
movq mm4,mm1
pmullw mm4,mm7
pmulhw mm3,mm7
movq mm7,mm4
punpckhwd mm4,mm3 // t0391u_128_32-->mm4
punpcklwd mm7,mm3 // t0391u_128_10-->mm7
movq qword ptr t0391u_128_32,mm4
movq qword ptr t0391u_128_10,mm7
// t2018u_128
movq mm7,qword ptr t2018
movq mm3,mm1
pmullw mm1,mm7
pmulhw mm3,mm7
movq mm7,mm1
punpckhwd mm1,mm3 // t2018u_128_32-->mm1
punpcklwd mm7,mm3 // t2018u_128_10-->mm7
movq qword ptr t2018u_128_32,mm1
movq qword ptr t2018u_128_10,mm7
// t1596v_128
movq mm7,qword ptr t1596
movq mm3,mm2
movq mm4,mm2
pmullw mm4,mm7
pmulhw mm3,mm7
movq mm7,mm4
punpckhwd mm4,mm3 // t1596v_128_32-->mm4
punpcklwd mm7,mm3 // t1596v_128_10-->mm7
movq qword ptr t1596v_128_32,mm4
movq qword ptr t1596v_128_10,mm7
// t0813v_128
movq mm7,qword ptr t0813
movq mm3,mm2
pmullw mm2,mm7
pmulhw mm3,mm7
movq mm7,mm2
punpckhwd mm2,mm3 // t0813v_128_32-->mm2
punpcklwd mm7,mm3 // t0813v_128_10-->mm7
movq qword ptr t0813v_128_32,mm2
movq qword ptr t0813v_128_10,mm7
// first row of y: load eight samples and reorder them so that the two
// pixels sharing a chroma sample end up in matching dword lanes
movq mm3,qword ptr [eax] // 76 54 32 10
pxor mm0,mm0
movq mm2,mm3
punpcklbw mm2,mm0 // 03 02 01 00
punpckhbw mm3,mm0 // 07 06 05 04
movq mm4,mm2
movq mm5,mm3
punpcklwd mm2,mm0 // 00 01 00 00
punpckhwd mm0,mm4 // 03 00 02 00
pxor mm4,mm4
por mm0,mm2 // 03 01 02 00--->mm0
movq mm7,qword ptr t16
punpcklwd mm3,mm4 // 00 05 00 04
punpckhwd mm4,mm5 // 07 00 06 00
por mm4,mm3 // 07 05 06 04-->mm4
psubsw mm0,mm7 // y-16
movq mm5,mm4
psubsw mm5,mm7 // y-16, pixels 4..7 kept in mm5 for later
// compute pixels 0..3
movq mm7,qword ptr t1164
movq mm6,mm0 // y3 y1 y2 y0
pmullw mm6,mm7
pmulhw mm0,mm7
movq mm7,mm6
punpckhwd mm7,mm0 // y3 y1 : 1.164(y-16)-->mm7
punpcklwd mm6,mm0 // y2 y0 : 1.164(y-16)-->mm6
movq mm0,qword ptr t1596v_128_10
movq mm1,mm6 // y2 y0
movq mm2,mm7 // y3 y1
// r=1.164(y-16)+1.596(v-128)
paddd mm1,mm0 // r2 r0
paddd mm2,mm0 // r3 r1
psrad mm1,12
psrad mm2,12
movq mm0,mm1
punpckhdq mm1,mm2 // r3 r2
punpckldq mm0,mm2 // r1 r0
packssdw mm0,mm1 // r3 r2 r1 r0 --->mm0
movq mm1,qword ptr t0391u_128_10
movq mm4,qword ptr t0813v_128_10
// g=1.164(y-16)-0.391(u-128)-0.813(v-128)
movq mm2,mm6
movq mm3,mm7
psubd mm2,mm1
psubd mm3,mm1
psubd mm2,mm4
psubd mm3,mm4
psrad mm2,12
psrad mm3,12
movq mm4,mm2
movq mm1,qword ptr t2018u_128_10
punpckhdq mm2,mm3
punpckldq mm4,mm3
packssdw mm4,mm2 // g3 g2 g1 g0 --->mm4
// b=1.164(y-16)+2.018(u-128)
paddd mm6,mm1 // b2 b0
paddd mm7,mm1 // b3 b1
psrad mm6,12
psrad mm7,12
movq mm1,mm6
punpckhdq mm1,mm7
punpckldq mm6,mm7
pxor mm2,mm2
packssdw mm6,mm1 // b3 b2 b1 b0 --->mm6
// b-->mm6,g-->mm4,r-->mm0
#ifdef ___0rgb
packuswb mm6,mm2
packuswb mm4,mm2
packuswb mm0,mm2
punpcklbw mm6,mm4 // g3 b3 g2 b2 g1 b1 g0 b0 -->mm6
punpcklbw mm0,mm2 // 00 r3 00 r2 00 r1 00 r0 -->mm0
movq mm7,mm6
punpcklwd mm6,mm0 // 00 r1 g1 b1 00 r0 g0 b0
punpckhwd mm7,mm0 // 00 r3 g3 b3 00 r2 g2 b2
movq qword ptr[edx], mm6
movq qword ptr[edx+8], mm7
#else
packuswb mm0,mm2 // r
packuswb mm4,mm2 // g
packuswb mm6,mm2 // b
punpcklbw mm0,mm4 // g3 r3 g2 r2 g1 r1 g0 r0 -->mm0
punpcklbw mm6,mm2 // 00 b3 00 b2 00 b1 00 b0 -->mm6
movq mm7,mm0
punpcklwd mm0,mm6 // 00 b1 g1 r1 00 b0 g0 r0
punpckhwd mm7,mm6 // 00 b3 g3 r3 00 b2 g2 r2
movq qword ptr[edx], mm0
movq qword ptr[edx+8], mm7
#endif
// compute pixels 4..7
movq mm7,qword ptr t1164
movq mm6,mm5 // y7 y5 y6 y4
pmullw mm6,mm7
pmulhw mm5,mm7
movq mm7,mm6
punpckhwd mm7,mm5 // y7 y5 : 1.164(y-16)-->mm7
punpcklwd mm6,mm5 // y6 y4 : 1.164(y-16)-->mm6
movq mm0,qword ptr t1596v_128_32
movq mm1,mm6 // y6 y4
movq mm2,mm7 // y7 y5
// r=1.164(y-16)+1.596(v-128)
paddd mm1,mm0 // r6 r4
paddd mm2,mm0 // r7 r5
psrad mm2,12
psrad mm1,12
movq mm0,mm1
punpckhdq mm1,mm2 // r7 r6
punpckldq mm0,mm2 // r5 r4
packssdw mm0,mm1 // r7 r6 r5 r4 --->mm0
movq mm1,qword ptr t0391u_128_32
movq mm4,qword ptr t0813v_128_32
// g=1.164(y-16)-0.391(u-128)-0.813(v-128)
movq mm2,mm6
movq mm3,mm7
psubd mm2,mm1
psubd mm3,mm1
psubd mm2,mm4
psubd mm3,mm4
psrad mm2,12
psrad mm3,12
movq mm1,qword ptr t2018u_128_32
movq mm4,mm2
punpckhdq mm2,mm3
punpckldq mm4,mm3
packssdw mm4,mm2 // g7 g6 g5 g4 --->mm4
// b=1.164(y-16)+2.018(u-128)
paddd mm6,mm1 // b6 b4
paddd mm7,mm1 // b7 b5
psrad mm6,12
psrad mm7,12
movq mm1,mm6
punpckhdq mm1,mm7
punpckldq mm6,mm7
pxor mm2,mm2
packssdw mm6,mm1 // b7 b6 b5 b4 --->mm6
// b-->mm6,g-->mm4,r-->mm0
#ifdef ___0rgb
packuswb mm6,mm2
packuswb mm4,mm2
punpcklbw mm6,mm4 // g7 b7 g6 b6 g5 b5 g4 b4 -->mm6
packuswb mm0,mm2
punpcklbw mm0,mm2 // 00 r7 00 r6 00 r5 00 r4 -->mm0
movq mm7,mm6
punpcklwd mm6,mm0 // 00 r5 g5 b5 00 r4 g4 b4
punpckhwd mm7,mm0 // 00 r7 g7 b7 00 r6 g6 b6
movq qword ptr[edx+16], mm6
movq qword ptr[edx+24], mm7
#else
packuswb mm0,mm2 // r
packuswb mm4,mm2 // g
packuswb mm6,mm2 // b
punpcklbw mm0,mm4 // g7 r7 g6 r6 g5 r5 g4 r4 -->mm0
punpcklbw mm6,mm2 // 00 b7 00 b6 00 b5 00 b4 -->mm6
movq mm7,mm0
punpcklwd mm0,mm6 // 00 b5 g5 r5 00 b4 g4 r4
punpckhwd mm7,mm6 // 00 b7 g7 r7 00 b6 g6 r6
movq qword ptr[edx+16], mm0
movq qword ptr[edx+24], mm7
#endif
/////
// second stage , next row of y (the stored chroma products are reused)
add eax,nyw
add edx,rgbwidth
movq mm3,qword ptr [eax] // 76 54 32 10
pxor mm0,mm0
movq mm2,mm3
punpcklbw mm2,mm0 // 03 02 01 00
punpckhbw mm3,mm0 // 07 06 05 04
movq mm4,mm2
punpcklwd mm2,mm0 // 00 01 00 00
punpckhwd mm0,mm4 // 03 00 02 00
pxor mm4,mm4
por mm0,mm2 // 03 01 02 00--->mm0
movq mm7,qword ptr t16
movq mm5,mm3
punpcklwd mm3,mm4 // 00 05 00 04
punpckhwd mm4,mm5 // 07 00 06 00
por mm4,mm3 // 07 05 06 04-->mm4
psubsw mm0,mm7 // y-16
movq mm5,mm4
psubsw mm5,mm7 // y-16
// compute pixels 0..3
movq mm7,qword ptr t1164
movq mm6,mm0 // y3 y1 y2 y0
pmullw mm6,mm7
pmulhw mm0,mm7
movq mm7,mm6
punpckhwd mm7,mm0 // y3 y1 : 1.164(y-16)-->mm7
punpcklwd mm6,mm0 // y2 y0 : 1.164(y-16)-->mm6
movq mm0,qword ptr t1596v_128_10
movq mm1,mm6 // y2 y0
movq mm2,mm7 // y3 y1
// r=1.164(y-16)+1.596(v-128)
paddd mm1,mm0 // r2 r0
paddd mm2,mm0 // r3 r1
psrad mm2,12
psrad mm1,12
movq mm0,mm1
punpckhdq mm1,mm2 // r3 r2
punpckldq mm0,mm2 // r1 r0
packssdw mm0,mm1 // r3 r2 r1 r0 --->mm0
movq mm1,qword ptr t0391u_128_10
movq mm4,qword ptr t0813v_128_10
// g=1.164(y-16)-0.391(u-128)-0.813(v-128)
movq mm2,mm6
movq mm3,mm7
psubd mm2,mm1
psubd mm3,mm1
psubd mm2,mm4
psubd mm3,mm4
psrad mm2,12
psrad mm3,12
movq mm4,mm2
movq mm1,qword ptr t2018u_128_10
punpckhdq mm2,mm3
punpckldq mm4,mm3
packssdw mm4,mm2 // g3 g2 g1 g0 --->mm4
// b=1.164(y-16)+2.018(u-128)
paddd mm6,mm1 // b2 b0
paddd mm7,mm1 // b3 b1
psrad mm6,12
psrad mm7,12
movq mm1,mm6
punpckhdq mm1,mm7
punpckldq mm6,mm7
pxor mm2,mm2
packssdw mm6,mm1 // b3 b2 b1 b0 --->mm6
// b-->mm6,g-->mm4,r-->mm0
#ifdef ___0rgb
packuswb mm6,mm2
packuswb mm4,mm2
punpcklbw mm6,mm4 // g3 b3 g2 b2 g1 b1 g0 b0 -->mm6
packuswb mm0,mm2
punpcklbw mm0,mm2 // 00 r3 00 r2 00 r1 00 r0 -->mm0
movq mm7,mm6
punpcklwd mm6,mm0 // 00 r1 g1 b1 00 r0 g0 b0
punpckhwd mm7,mm0 // 00 r3 g3 b3 00 r2 g2 b2
movq qword ptr[edx], mm6
movq qword ptr[edx+8], mm7
#else
packuswb mm0,mm2 // r
packuswb mm4,mm2 // g
packuswb mm6,mm2 // b
punpcklbw mm0,mm4 // g3 r3 g2 r2 g1 r1 g0 r0 -->mm0
punpcklbw mm6,mm2 // 00 b3 00 b2 00 b1 00 b0 -->mm6
movq mm7,mm0
punpcklwd mm0,mm6 // 00 b1 g1 r1 00 b0 g0 r0
punpckhwd mm7,mm6 // 00 b3 g3 r3 00 b2 g2 r2
movq qword ptr[edx], mm0
movq qword ptr[edx+8], mm7
#endif
// compute pixels 4..7
movq mm7,qword ptr t1164
movq mm6,mm5 // y7 y5 y6 y4
pmullw mm6,mm7
pmulhw mm5,mm7
movq mm7,mm6
punpckhwd mm7,mm5 // y7 y5 : 1.164(y-16)-->mm7
punpcklwd mm6,mm5 // y6 y4 : 1.164(y-16)-->mm6
movq mm0,qword ptr t1596v_128_32
movq mm1,mm6 // y6 y4
movq mm2,mm7 // y7 y5
// r=1.164(y-16)+1.596(v-128)
paddd mm1,mm0 // r6 r4
paddd mm2,mm0 // r7 r5
psrad mm1,12
psrad mm2,12
movq mm0,mm1
punpckhdq mm1,mm2 // r7 r6
punpckldq mm0,mm2 // r5 r4
packssdw mm0,mm1 // r7 r6 r5 r4 --->mm0
movq mm1,qword ptr t0391u_128_32
movq mm4,qword ptr t0813v_128_32
// g=1.164(y-16)-0.391(u-128)-0.813(v-128)
movq mm2,mm6
movq mm3,mm7
psubd mm2,mm1
psubd mm3,mm1
psubd mm2,mm4
psubd mm3,mm4
psrad mm2,12
psrad mm3,12
movq mm1,qword ptr t2018u_128_32
movq mm4,mm2
punpckhdq mm2,mm3
punpckldq mm4,mm3
packssdw mm4,mm2 // g7 g6 g5 g4 --->mm4
// b=1.164(y-16)+2.018(u-128)
paddd mm6,mm1 // b6 b4
paddd mm7,mm1 // b7 b5
psrad mm6,12
psrad mm7,12
movq mm1,mm6
punpckhdq mm1,mm7
punpckldq mm6,mm7
pxor mm2,mm2
packssdw mm6,mm1 // b7 b6 b5 b4 --->mm6
// b-->mm6,g-->mm4,r-->mm0
#ifdef ___0rgb
packuswb mm6,mm2
packuswb mm4,mm2
punpcklbw mm6,mm4 // g7 b7 g6 b6 g5 b5 g4 b4 -->mm6
packuswb mm0,mm2
punpcklbw mm0,mm2 // 00 r7 00 r6 00 r5 00 r4 -->mm0
movq mm7,mm6
punpcklwd mm6,mm0 // 00 r5 g5 b5 00 r4 g4 b4
punpckhwd mm7,mm0 // 00 r7 g7 b7 00 r6 g6 b6
movq qword ptr[edx+16], mm6
movq qword ptr[edx+24], mm7
#else
packuswb mm0,mm2 // r
packuswb mm4,mm2 // g
packuswb mm6,mm2 // b
punpcklbw mm0,mm4 // g7 r7 g6 r6 g5 r5 g4 r4 -->mm0
punpcklbw mm6,mm2 // 00 b7 00 b6 00 b5 00 b4 -->mm6
movq mm7,mm0
punpcklwd mm0,mm6 // 00 b5 g5 r5 00 b4 g4 r4
punpckhwd mm7,mm6 // 00 b7 g7 r7 00 b6 g6 r6
movq qword ptr[edx+16], mm0
movq qword ptr[edx+24], mm7
#endif
// back to the first row of the pair, then on to the next eight pixels
sub eax,nyw
sub edx,rgbwidth
add esi,4
add edi,4
add eax,8
add edx,32
dec ecx
jnz rrr
// end of a row pair: skip the second row, which is already done
mov ecx,col
add eax,nyw
add edx,rgbwidth
dec ebx
jnz rrr
emms
}
}
#else
// Constants for the 16-bit variant of yuv2rgb4XmmxC420().  Every table holds
// the same value in all four 16-bit lanes of one MMX register.  The function
// shifts each input left by the amount noted after the value ("<<n") and
// then multiplies with pmulhw, which keeps the upper 16 bits of the product;
// the stored value is therefore the coefficient times 65536 / 2^n, rounded.
// 1.164 * 16384, luma is shifted left by 2 first
short t1164[4]=
{
19071,19071,19071,19071//<<2
};
// 1.596 * 16384, v is shifted left by 2 first
short t1596[4]=
{
26149,26149,26149,26149//<<2
};
// 0.391 * 65536, u is used unshifted
short t0391[4]=
{
25625,25625,25625,25625//<<0
};
// 0.813 * 32768, v is shifted left by 1 first
short t0813[4]=
{
26641,26641,26641,26641//<<1
};
// 2.018 * 8192, u is shifted left by 3 first
short t2018[4]=
{
16532,16532,16532,16532//<<3
};
// Offset subtracted from every luma sample.
short t16[4]=
{
16,16,16,16
};
// Offset subtracted from every chroma sample.
short t128[4]=
{
128,128,128,128
};
// MMX conversion of a planar YUV 4:2:0 picture to 32-bit pixels, using
// 16-bit arithmetic only (inputs are pre-shifted and multiplied with pmulhw,
// sums use saturating adds).
//
// lpY        : luma plane, nSrcWidth bytes per row
// lpU, lpV   : chroma planes; one byte of each is shared by a 2x2 block of
//              luma samples, and the bytes are consumed strictly sequentially
// lpRGB      : output, 4 bytes per pixel stored as B,G,R,0 and
//              nSrcWidth*4 bytes per row
// nSrcHeight : luma height; nSrcHeight>>1 row pairs are processed
// nSrcWidth  : luma width; nSrcWidth>>3 groups of eight pixels are processed
//              per row, so the width has to be a multiple of 8 (and the
//              height even) for the row addressing to stay in step
//
// Returns without writing anything when a pointer is NULL or when the
// picture is smaller than 8x2.  Requires an MMX-capable CPU and MSVC 32-bit
// inline assembly; "emms" is executed before returning.
void VideoPlayer::yuv2rgb4XmmxC420(unsigned char *lpY,
unsigned char *lpU,
unsigned char *lpV,
unsigned char *lpRGB,
int nSrcHeight,
int nSrcWidth)
{
int rgbwidth=nSrcWidth<<2;// 32 bits rgb0;
int nyw=nSrcWidth;
int col=nSrcWidth>>3;
int row=nSrcHeight>>1;
// Registers that keep the four chroma terms alive for a whole group of
// eight pixels in both rows.
#define mmt2018u mm1
#define mmt0813v mm2
#define mmt0391u mm3
#define mmt1596v mm4
__int64 ty; // scaled luma of the odd pixels, parked while the even ones are done
if (!lpY || !lpU || !lpV || !lpRGB)
{
return;
}
if (col<=0 || row<=0)
{
// "dec ecx / jnz" and "dec ebx / jnz" would wrap around on a zero count.
return;
}
__asm
{
mov esi,lpU
mov edi,lpV
mov eax,lpY
mov edx,lpRGB
mov ecx,col
mov ebx,row
rrr:
pxor mm0,mm0
movq mm3,qword ptr t128
movq mm4,qword ptr t0391
movq mm5,qword ptr t2018
movq mm6,qword ptr t1596
movq mm7,qword ptr t0813
movd mm1,dword ptr [esi]
movd mm2,dword ptr [edi]
punpcklbw mm1,mm0
punpcklbw mm2,mm0
// compute u,v
psubsw mm1,mm3 // u-128
psubsw mm2,mm3 // v-128
movq mm3,mm1
psllw mm1,3
pmulhw mm3,mm4 // t0391u-->mm3
pmulhw mm1,mm5 // t2018u-->mm1
movq mm4,mm2
psllw mm2,1
psllw mm4,2
pmulhw mm2,mm7 // t0813v-->mm2
pmulhw mm4,mm6 // t1596v-->mm4
// first row of y: separate the even and the odd pixels so that each group
// of four lines up with the four chroma samples
movq mm5,qword ptr [eax] // 76 54 32 10
pxor mm0,mm0
movq mm6,mm5
punpcklbw mm5,mm0 // 03 02 01 00
punpckhbw mm0,mm6 // 70 60 50 40
por mm0,mm5 // 73 62 51 40
pxor mm6,mm6
pxor mm5,mm5
punpckhbw mm6,mm0 // 70 30 60 20
punpcklbw mm0,mm5 // 05 01 04 00
por mm0,mm6 // 75 31 64 20
pxor mm5,mm5
movq mm6,mm0
punpckhbw mm6,mm5 // y7 y5 y3 y1
punpcklbw mm0,mm5 // y6 y4 y2 y0
movq mm5,qword ptr t16
movq mm7,qword ptr t1164
psubsw mm6,mm5
psubsw mm0,mm5
psllw mm6,2
psllw mm0,2
pmulhw mm6,mm7
pmulhw mm0,mm7 // y6 y4 y2 y0 -->mm0
movq qword ptr ty,mm6 // y7 y5 y3 y1 -->ty
// compute the even pixels 0,2,4,6
pxor mm7,mm7
movq mm5,mmt1596v
movq mm6,mm0 // copy 1.164(y-16)
paddsw mm5,mm0 // r=1.164(y-16)+1.596(v-128) r-->mm5
psubsw mm6,mmt0391u // 1.164(y-16)-0.391(u-128)
psubsw mm6,mmt0813v//g=1.164(y-16)-0.391(u-128)-0.813(v-128) g-->mm6
paddsw mm0,mmt2018u // b=1.164(y-16)+2.018(u-128) b-->mm0
// b-->mm0,g-->mm6,r-->mm5
packuswb mm6,mm7
packuswb mm0,mm7
punpcklbw mm0,mm6 // g6 b6 g4 b4 g2 b2 g0 b0 -->mm0
packuswb mm5,mm7
punpcklbw mm5,mm7 // 00 r6 00 r4 00 r2 00 r0 -->mm5
movq mm7,mm0
punpcklwd mm0,mm5 // 00 r2 g2 b2 00 r0 g0 b0
punpckhwd mm7,mm5 // 00 r6 g6 b6 00 r4 g4 b4
// park the even pixels in the output buffer; they are read back and
// interleaved with the odd pixels below
movq qword ptr[edx], mm0
movq mm0,qword ptr ty
movq qword ptr[edx+8], mm7
// compute the odd pixels 1,3,5,7
pxor mm7,mm7
movq mm5,mmt1596v
paddsw mm5,mm0 // r=1.164(y-16)+1.596(v-128) r-->mm5
movq mm6,mm0 // copy 1.164(y-16)
psubsw mm6,mmt0391u // 1.164(y-16)-0.391(u-128)
psubsw mm6,mmt0813v//g=1.164(y-16)-0.391(u-128)-0.813(v-128) g-->mm6
paddsw mm0,mmt2018u // b=1.164(y-16)+2.018(u-128) b-->mm0
// b-->mm0,g-->mm6,r-->mm5
packuswb mm6,mm7
packuswb mm0,mm7
punpcklbw mm0,mm6 // g7 b7 g5 b5 g3 b3 g1 b1 -->mm0
packuswb mm5,mm7
punpcklbw mm5,mm7 // 00 r7 00 r5 00 r3 00 r1 -->mm5
movq mm7,mm0
movq mm6,[edx] // 2 0
punpcklwd mm0,mm5 // 00 r3 g3 b3 00 r1 g1 b1
punpckhwd mm7,mm5 // 00 r7 g7 b7 00 r5 g5 b5
movq mm5,mm6
punpckldq mm6,mm0 // 1 0
punpckhdq mm5,mm0 // 3 2
movq mm0,[edx+8] // 6 4
movq [edx],mm6
movq [edx+8], mm5
movq mm6,mm0
punpckhdq mm0,mm7 // 7 6
punpckldq mm6,mm7 // 5 4
movq [edx+24], mm0
movq [edx+16],mm6
// next row of y (the chroma terms in mm1..mm4 are reused)
add eax,nyw
add edx,rgbwidth
movq mm5,qword ptr [eax] // 76 54 32 10
pxor mm0,mm0
movq mm6,mm5
punpcklbw mm5,mm0 // 03 02 01 00
punpckhbw mm0,mm6 // 70 60 50 40
por mm0,mm5 // 73 62 51 40
pxor mm6,mm6
pxor mm5,mm5
punpckhbw mm6,mm0 // 70 30 60 20
punpcklbw mm0,mm5 // 05 01 04 00
por mm0,mm6 // 75 31 64 20
pxor mm5,mm5
movq mm6,mm0
punpckhbw mm6,mm5 // y7 y5 y3 y1
punpcklbw mm0,mm5 // y6 y4 y2 y0
movq mm5,qword ptr t16
movq mm7,qword ptr t1164
psubsw mm6,mm5
psubsw mm0,mm5
psllw mm6,2
psllw mm0,2
pmulhw mm6,mm7
pmulhw mm0,mm7 // y6 y4 y2 y0 -->mm0
movq qword ptr ty,mm6 // y7 y5 y3 y1 -->ty
// compute the even pixels 0,2,4,6
pxor mm7,mm7
movq mm5,mmt1596v
paddsw mm5,mm0 // r=1.164(y-16)+1.596(v-128) r-->mm5
movq mm6,mm0 // copy 1.164(y-16)
psubsw mm6,mmt0391u // 1.164(y-16)-0.391(u-128)
psubsw mm6,mmt0813v//g=1.164(y-16)-0.391(u-128)-0.813(v-128) g-->mm6
paddsw mm0,mmt2018u // b=1.164(y-16)+2.018(u-128) b-->mm0
// b-->mm0,g-->mm6,r-->mm5
packuswb mm6,mm7
packuswb mm0,mm7
punpcklbw mm0,mm6 // g6 b6 g4 b4 g2 b2 g0 b0 -->mm0
packuswb mm5,mm7
punpcklbw mm5,mm7 // 00 r6 00 r4 00 r2 00 r0 -->mm5
movq mm7,mm0
punpcklwd mm0,mm5 // 00 r2 g2 b2 00 r0 g0 b0
punpckhwd mm7,mm5 // 00 r6 g6 b6 00 r4 g4 b4
movq qword ptr[edx], mm0
movq qword ptr[edx+8], mm7
// compute the odd pixels 1,3,5,7
movq mm0,qword ptr ty
pxor mm7,mm7
movq mm5,mmt1596v
paddsw mm5,mm0 // r=1.164(y-16)+1.596(v-128) r-->mm5
movq mm6,mm0 // copy 1.164(y-16)
psubsw mm6,mmt0391u // 1.164(y-16)-0.391(u-128)
psubsw mm6,mmt0813v//g=1.164(y-16)-0.391(u-128)-0.813(v-128) g-->mm6
paddsw mm0,mmt2018u // b=1.164(y-16)+2.018(u-128) b-->mm0
// b-->mm0,g-->mm6,r-->mm5
packuswb mm6,mm7
packuswb mm0,mm7
punpcklbw mm0,mm6 // g7 b7 g5 b5 g3 b3 g1 b1 -->mm0
packuswb mm5,mm7
punpcklbw mm5,mm7 // 00 r7 00 r5 00 r3 00 r1 -->mm5
movq mm7,mm0
movq mm6,[edx] // 2 0
punpcklwd mm0,mm5 // 00 r3 g3 b3 00 r1 g1 b1
punpckhwd mm7,mm5 // 00 r7 g7 b7 00 r5 g5 b5
movq mm5,mm6
punpckldq mm6,mm0 // 1 0
punpckhdq mm5,mm0 // 3 2
movq mm0,[edx+8] // 6 4
movq [edx],mm6
movq [edx+8], mm5
movq mm6,mm0
punpckhdq mm0,mm7 // 7 6
punpckldq mm6,mm7 // 5 4
movq [edx+24], mm0
movq [edx+16],mm6
// back to the first row of the pair, then on to the next eight pixels
sub eax,nyw
sub edx,rgbwidth
add esi,4
add edi,4
add eax,8
add edx,32
dec ecx
jnz rrr
// end of a row pair: skip the second row, which is already done
mov ecx,col
add eax,nyw
add edx,rgbwidth
dec ebx
jnz rrr
emms
}
}
#endif
// Source article: http://com.8s8s.com/it/it3903.htm