diff vp8.c @ 12238:1a7903913e9b libavcodec

VP8: 30% faster idct_mb Take shortcuts based on statically common situations. Add 4-at-a-time idct_dc function (mmx and sse2) since rows of 4 DC-only DCT blocks are common. TODO: tie this more directly into the MB mode, since the DC-level transform is only used for non-splitmv blocks?
author darkshikari
date Fri, 23 Jul 2010 02:58:27 +0000
parents f0c4dc49c8f1
children e6ade5e849c9
line wrap: on
line diff
--- a/vp8.c	Fri Jul 23 01:59:56 2010 +0000
+++ b/vp8.c	Fri Jul 23 02:58:27 2010 +0000
@@ -1186,45 +1186,49 @@
     }
 }
 
-static void idct_mb(VP8Context *s, uint8_t *y_dst, uint8_t *u_dst, uint8_t *v_dst,
-                    VP8Macroblock *mb)
+static void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
 {
-    int x, y, nnz;
+    int x, y, ch, nnz;
 
-    if (mb->mode != MODE_I4x4)
+    if (mb->mode != MODE_I4x4) {
+        uint8_t *y_dst = dst[0];
         for (y = 0; y < 4; y++) {
-            for (x = 0; x < 4; x++) {
-                nnz = s->non_zero_count_cache[y][x];
-                if (nnz) {
-                    if (nnz == 1)
-                        s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
-                    else
-                        s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+            uint32_t nnz = AV_RN32A(s->non_zero_count_cache[y]);
+            if (nnz) {
+                if (nnz&~0x01010101) {
+                    for (x = 0; x < 4; x++) {
+                        nnz = s->non_zero_count_cache[y][x];
+                        if (nnz) {
+                            if (nnz == 1)
+                                s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
+                            else
+                                s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
+                        }
+                    }
+                } else {
+                    s->vp8dsp.vp8_idct_dc_add4(y_dst, s->block[y], s->linesize);
                 }
             }
             y_dst += 4*s->linesize;
         }
+    }
 
-    for (y = 0; y < 2; y++) {
-        for (x = 0; x < 2; x++) {
-            nnz = s->non_zero_count_cache[4][(y<<1)+x];
-            if (nnz) {
-                if (nnz == 1)
-                    s->vp8dsp.vp8_idct_dc_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
-                else
-                    s->vp8dsp.vp8_idct_add(u_dst+4*x, s->block[4][(y<<1)+x], s->uvlinesize);
-            }
-
-            nnz = s->non_zero_count_cache[5][(y<<1)+x];
-            if (nnz) {
-                if (nnz == 1)
-                    s->vp8dsp.vp8_idct_dc_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
-                else
-                    s->vp8dsp.vp8_idct_add(v_dst+4*x, s->block[5][(y<<1)+x], s->uvlinesize);
+    for (ch = 0; ch < 2; ch++) {
+        if (AV_RN32A(s->non_zero_count_cache[4+ch])) {
+            uint8_t *ch_dst = dst[1+ch];
+            for (y = 0; y < 2; y++) {
+                for (x = 0; x < 2; x++) {
+                    nnz = s->non_zero_count_cache[4+ch][(y<<1)+x];
+                    if (nnz) {
+                        if (nnz == 1)
+                            s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+                        else
+                            s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+                    }
+                }
+                ch_dst += 4*s->uvlinesize;
             }
         }
-        u_dst += 4*s->uvlinesize;
-        v_dst += 4*s->uvlinesize;
     }
 }
 
@@ -1511,7 +1515,7 @@
             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
 
             if (!mb->skip) {
-                idct_mb(s, dst[0], dst[1], dst[2], mb);
+                idct_mb(s, dst, mb);
             } else {
                 AV_ZERO64(s->left_nnz);
                 AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned