Mercurial > libavcodec.hg

--- a/vp8.c	Mon Aug 02 09:44:53 2010 +0000
+++ b/vp8.c	Mon Aug 02 20:18:09 2010 +0000
@@ -117,6 +117,7 @@
      */
     DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
     DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
+    DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
     uint8_t intra4x4_pred_mode_mb[16];

     int chroma_pred_mode;    ///< 8x8c pred mode of the current macroblock
@@ -864,22 +865,19 @@
 void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                       uint8_t t_nnz[9], uint8_t l_nnz[9])
 {
-    LOCAL_ALIGNED_16(DCTELEM, dc,[16]);
     int i, x, y, luma_start = 0, luma_ctx = 3;
     int nnz_pred, nnz, nnz_total = 0;
     int segment = s->segment;

     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
-        AV_ZERO128(dc);
-        AV_ZERO128(dc+8);
         nnz_pred = t_nnz[8] + l_nnz[8];

         // decode DC values and do hadamard
-        nnz = decode_block_coeffs(c, dc, s->prob->token[1], 0, nnz_pred,
+        nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred,
                                   s->qmat[segment].luma_dc_qmul);
         l_nnz[8] = t_nnz[8] = !!nnz;
         nnz_total += nnz;
-        s->vp8dsp.vp8_luma_dc_wht(s->block, dc);
+        s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc);
         luma_start = 1;
         luma_ctx = 0;
     }
--- a/vp8dsp.c	Mon Aug 02 09:44:53 2010 +0000
+++ b/vp8dsp.c	Mon Aug 02 20:18:09 2010 +0000
@@ -46,6 +46,10 @@
         t1 = dc[i*4+1] + dc[i*4+2];
         t2 = dc[i*4+1] - dc[i*4+2];
         t3 = dc[i*4+0] - dc[i*4+3] + 3; // rounding
+        dc[i*4+0] = 0;
+        dc[i*4+1] = 0;
+        dc[i*4+2] = 0;
+        dc[i*4+3] = 0;

         *block[i][0] = (t0 + t1) >> 3;
         *block[i][1] = (t3 + t2) >> 3;
--- a/x86/vp8dsp-init.c	Mon Aug 02 09:44:53 2010 +0000
+++ b/x86/vp8dsp-init.c	Mon Aug 02 20:18:09 2010 +0000
@@ -224,6 +224,7 @@
 extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
 extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16], int stride);
 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
+extern void ff_vp8_luma_dc_wht_sse(DCTELEM block[4][4][16], DCTELEM dc[16]);
 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
 extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);

@@ -335,6 +336,7 @@

     if (mm_flags & FF_MM_SSE) {
         c->vp8_idct_add                         = ff_vp8_idct_add_sse;
+        c->vp8_luma_dc_wht                      = ff_vp8_luma_dc_wht_sse;
         c->put_vp8_epel_pixels_tab[0][0][0]     =
         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
     }
--- a/x86/vp8dsp.asm	Mon Aug 02 09:44:53 2010 +0000
+++ b/x86/vp8dsp.asm	Mon Aug 02 20:18:09 2010 +0000
@@ -1186,12 +1186,23 @@
     SWAP %1, %4, %3
 %endmacro

-INIT_MMX
-cglobal vp8_luma_dc_wht_mmx, 2,3
+%macro VP8_DC_WHT 1
+cglobal vp8_luma_dc_wht_%1, 2,3
     movq          m0, [r1]
     movq          m1, [r1+8]
     movq          m2, [r1+16]
     movq          m3, [r1+24]
+%ifidn %1, sse
+    xorps      xmm0, xmm0
+    movaps  [r1+ 0], xmm0
+    movaps  [r1+16], xmm0
+%else
+    pxor         m4, m4
+    movq    [r1+ 0], m4
+    movq    [r1+ 8], m4
+    movq    [r1+16], m4
+    movq    [r1+24], m4
+%endif
     HADAMARD4_1D  0, 1, 2, 3
     TRANSPOSE4x4W 0, 1, 2, 3, 4
     paddw         m0, [pw_3]
@@ -1203,6 +1214,11 @@
     SCATTER_WHT   0, 1, 0
     SCATTER_WHT   2, 3, 2
     RET
+%endmacro
+
+INIT_MMX
+VP8_DC_WHT mmx
+VP8_DC_WHT sse

 ;-----------------------------------------------------------------------------
 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);