YV12的视频回放

类别:编程语言 点击:0 评论:0 推荐:

这两天等待应聘的offer,闲着无聊,回忆起1年多以前做的东西,mpeg4标准的rtp流,收到后在本地回放
我使用的是ddraw来显示。一开始为了先显示出东西进行调试,并且当时重点在同步以及接收流,所以当时用了yuv2bmp
这个东西:把yv12转换成了bmp,然后一帧一帧贴到窗口dc上。呵呵,效果可想而知 ;) 既经过了矩阵转换,又是draw->dc。
后来项目进入优化阶段,决定用ddraw的overlay来显示yv12视频。具体的方法就是参照Dxsdk7里的“蚊子”程序。
把yv12直接传到overlay显示. ;)
蚊子显示的是yuv422,和yv12相比较,只是在逐行memcpy到pSurf的时候排列不同而已,具体参照yuv标准就行了

(简介ddraw的使用,dxsdk7是ddraw的最后版本了)

yuv2bmp的实现:
void  yuv2rgb_32(uint8_t *puc_y, int stride_y,
                uint8_t *puc_u, uint8_t *puc_v, int stride_uv,
                uint8_t *puc_out, int width_y, int height_y,
        unsigned int _stride_out)
{
/* int x, y;
 int stride_diff = 4 * (_stride_out - width_y);

 if (height_y < 0) {
  // we are flipping our output upside-down
  height_y  = -height_y;
  puc_y     += (height_y   - 1) * stride_y ;
  puc_u     += (height_y/2 - 1) * stride_uv;
  puc_v     += (height_y/2 - 1) * stride_uv;
  stride_y  = -stride_y;
  stride_uv = -stride_uv;
 }

 for (y=0; y<height_y; y++)
 {
  for (x=0; x<width_y; x++)
  {
   signed int _r,_g,_b;
   signed int r, g, b;
   signed int y, u, v;

   y = puc_y[x] +10;//- 16;
   u = puc_u[x>>1]-128;
   v = puc_v[x>>1]-128;

   _r = _R(y,u,v);
   _g = _G(y,u,v);
   _b = _B(y,u,v);

   r = _S(_r);
   g = _S(_g);
   b = _S(_b);

   puc_out[0] = r;
   puc_out[1] = g;
   puc_out[2] = b;
   puc_out[3] = 0;

   puc_out+=4;
  }

  puc_y   += stride_y;
  if (y%2) {
   puc_u   += stride_uv;
   puc_v   += stride_uv;
  }
  puc_out += stride_diff;
 }*/


/////////////  Intel MMX ///////////////

 int y, horiz_count;
 int stride_out = width_y <<2;

 if (height_y < 0) {
  // we are flipping our output upside-down
  height_y  = -height_y;
  puc_y     += (height_y   - 1) * stride_y ;
  puc_u     += ((height_y>>1) - 1) * stride_uv;
  puc_v     += ((height_y>>1) - 1) * stride_uv;
  stride_y  = -stride_y;
  stride_uv = -stride_uv;
 }

 horiz_count = -(width_y >> 3);

 
 for (y=0; y<height_y; y++) {
 
  _asm {
   push eax
   push ebx
   push ecx
   push edx
   push edi

   mov eax, puc_out      
   mov ebx, puc_y      
   mov ecx, puc_u      
   mov edx, puc_v
   mov edi, horiz_count
   
  horiz_loop:

   movd mm2, [ecx]
   pxor mm7, mm7

   movd mm3, [edx]
   punpcklbw mm2, mm7       ; mm2 = __u3__u2__u1__u0

   movq mm0, [ebx]          ; mm0 = y7y6y5y4y3y2y1y0 
   punpcklbw mm3, mm7       ; mm3 = __v3__v2__v1__v0

   movq mm1, mmw_0x00ff     ; mm1 = 00ff00ff00ff00ff

   psubusb mm0, mmb_0x10    ; mm0 -= 16

   psubw mm2, mmw_0x0080    ; mm2 -= 128
   pand mm1, mm0            ; mm1 = __y6__y4__y2__y0

   psubw mm3, mmw_0x0080    ; mm3 -= 128
   psllw mm1, 3             ; mm1 *= 8

   psrlw mm0, 8             ; mm0 = __y7__y5__y3__y1
   psllw mm2, 3             ; mm2 *= 8

   pmulhw mm1, mmw_mult_Y   ; mm1 *= luma coeff
   psllw mm0, 3             ; mm0 *= 8

   psllw mm3, 3             ; mm3 *= 8
   movq mm5, mm3            ; mm5 = mm3 = v

   pmulhw mm5, mmw_mult_V_R ; mm5 = red chroma
   movq mm4, mm2            ; mm4 = mm2 = u

   pmulhw mm0, mmw_mult_Y   ; mm0 *= luma coeff
   movq mm7, mm1            ; even luma part

   pmulhw mm2, mmw_mult_U_G ; mm2 *= u green coeff
   paddsw mm7, mm5          ; mm7 = luma + chroma    __r6__r4__r2__r0

   pmulhw mm3, mmw_mult_V_G ; mm3 *= v green coeff 
   packuswb mm7, mm7        ; mm7 = r6r4r2r0r6r4r2r0

   pmulhw mm4, mmw_mult_U_B ; mm4 = blue chroma
   paddsw mm5, mm0          ; mm5 = luma + chroma    __r7__r5__r3__r1

   packuswb mm5, mm5        ; mm6 = r7r5r3r1r7r5r3r1
   paddsw mm2, mm3          ; mm2 = green chroma

   movq mm3, mm1            ; mm3 = __y6__y4__y2__y0
   movq mm6, mm1            ; mm6 = __y6__y4__y2__y0

   paddsw mm3, mm4          ; mm3 = luma + chroma    __b6__b4__b2__b0
   paddsw mm6, mm2          ; mm6 = luma + chroma    __g6__g4__g2__g0
   
   punpcklbw mm7, mm5       ; mm7 = r7r6r5r4r3r2r1r0
   paddsw mm2, mm0          ; odd luma part plus chroma part    __g7__g5__g3__g1

   packuswb mm6, mm6        ; mm2 = g6g4g2g0g6g4g2g0
   packuswb mm2, mm2        ; mm2 = g7g5g3g1g7g5g3g1

   packuswb mm3, mm3        ; mm3 = b6b4b2b0b6b4b2b0
   paddsw mm4, mm0          ; odd luma part plus chroma part    __b7__b5__b3__b1

   packuswb mm4, mm4        ; mm4 = b7b5b3b1b7b5b3b1
   punpcklbw mm6, mm2       ; mm6 = g7g6g5g4g3g2g1g0

   punpcklbw mm3, mm4       ; mm3 = b7b6b5b4b3b2b1b0

   // 32-bit shuffle....
   pxor mm0, mm0            ; is this needed?

   movq mm1, mm6            ; mm1 = g7g6g5g4g3g2g1g0
   punpcklbw mm1, mm0       ; mm1 = __g3__g2__g1__g0

   movq mm0, mm3            ; mm0 = b7b6b5b4b3b2b1b0
   punpcklbw mm0, mm7       ; mm0 = r3b3r2b2r1b1r0b0

   movq mm2, mm0            ; mm2 = r3b3r2b2r1b1r0b0

   punpcklbw mm0, mm1       ; mm0 = __r1g1b1__r0g0b0
   punpckhbw mm2, mm1       ; mm2 = __r3g3b3__r2g2b2

   // 32-bit save...
   movq  [eax], mm0         ; eax[0] = __r1g1b1__r0g0b0
   movq mm1, mm6            ; mm1 = g7g6g5g4g3g2g1g0

   movq 8[eax], mm2         ; eax[8] = __r3g3b3__r2g2b2

   // 32-bit shuffle....
   pxor mm0, mm0            ; is this needed?

   punpckhbw mm1, mm0       ; mm1 = __g7__g6__g5__g4

   movq mm0, mm3            ; mm0 = b7b6b5b4b3b2b1b0
   punpckhbw mm0, mm7       ; mm0 = r7b7r6b6r5b5r4b4

   movq mm2, mm0            ; mm2 = r7b7r6b6r5b5r4b4

   punpcklbw mm0, mm1       ; mm0 = __r5g5b5__r4g4b4
   punpckhbw mm2, mm1       ; mm2 = __r7g7b7__r6g6b6

   //32-bit save...
   add ebx, 8               ; puc_y   += 8;
   add ecx, 4               ; puc_u   += 4;

   movq 16[eax], mm0        ; eax[16] = __r5g5b5__r4g4b4
   add edx, 4               ; puc_v   += 4;

   movq 24[eax], mm2        ; eax[24] = __r7g7b7__r6g6b6
   
   // 0 1 2 3 4 5 6 7 rgb save order

   add eax, 32              ; puc_out += 32

   inc edi
   jne horiz_loop   

   pop edi
   pop edx
   pop ecx
   pop ebx
   pop eax

   emms
      
  }


  puc_y   += stride_y;
  if (y&0x01){//%2) {
   puc_u   += stride_uv;
   puc_v   += stride_uv;
  }
  puc_out += stride_out;
 }
}

本文地址:http://com.8s8s.com/it/it26775.htm