diff vp8.c @ 12241:c7f6ddcc5c01 libavcodec

VP8: optimize the DC-only chroma case in the same way as luma. Add an MMX idct_dc_add4uv function for this case. Chroma IDCT is ~40% faster.
author darkshikari
date Fri, 23 Jul 2010 06:02:52 +0000
parents e6ade5e849c9
children a2f6d8c61b9c
line wrap: on
line diff
--- a/vp8.c	Fri Jul 23 03:44:37 2010 +0000
+++ b/vp8.c	Fri Jul 23 06:02:52 2010 +0000
@@ -1206,7 +1206,7 @@
                         }
                     }
                 } else {
-                    s->vp8dsp.vp8_idct_dc_add4(y_dst, s->block[y], s->linesize);
+                    s->vp8dsp.vp8_idct_dc_add4y(y_dst, s->block[y], s->linesize);
                 }
             }
             y_dst += 4*s->linesize;
@@ -1214,19 +1214,24 @@
     }
 
     for (ch = 0; ch < 2; ch++) {
-        if (AV_RN32A(s->non_zero_count_cache[4+ch])) {
+        uint32_t nnz4 = AV_RN32A(s->non_zero_count_cache[4+ch]);
+        if (nnz4) {
             uint8_t *ch_dst = dst[1+ch];
-            for (y = 0; y < 2; y++) {
-                for (x = 0; x < 2; x++) {
-                    int nnz = s->non_zero_count_cache[4+ch][(y<<1)+x];
-                    if (nnz) {
-                        if (nnz == 1)
-                            s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
-                        else
-                            s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+            if (nnz4&~0x01010101) {
+                for (y = 0; y < 2; y++) {
+                    for (x = 0; x < 2; x++) {
+                        int nnz = s->non_zero_count_cache[4+ch][(y<<1)+x];
+                        if (nnz) {
+                            if (nnz == 1)
+                                s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+                            else
+                                s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
+                        }
                     }
+                    ch_dst += 4*s->uvlinesize;
                 }
-                ch_dst += 4*s->uvlinesize;
+            } else {
+                s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, s->block[4+ch], s->uvlinesize);
             }
         }
     }