diff vp8dsp.c @ 12238:1a7903913e9b libavcodec

VP8: 30% faster idct_mb. Take shortcuts based on statistically common situations. Add 4-at-a-time idct_dc function (mmx and sse2) since rows of 4 DC-only DCT blocks are common. TODO: tie this more directly into the MB mode, since the DC-level transform is only used for non-splitmv blocks?
author darkshikari
date Fri, 23 Jul 2010 02:58:27 +0000
parents e08d65897115
children c7f6ddcc5c01
line wrap: on
line diff
--- a/vp8dsp.c	Fri Jul 23 01:59:56 2010 +0000
+++ b/vp8dsp.c	Fri Jul 23 02:58:27 2010 +0000
@@ -109,6 +109,25 @@
     }
 }
 
+static void vp8_idct_dc_add4_c(uint8_t *dst, DCTELEM block[4][16], int stride)  // DC-only add for four 4x4 blocks laid side by side along one row of dst
+{
+    int i, j;
+    for (j = 0; j < 4; j++) {  // one iteration per 4x4 block
+        uint8_t *pix = dst+j*4;  // block j starts 4 pixels to the right of block j-1
+        int dc = (block[j][0] + 4) >> 3;  // first coefficient: add 4, then shift right by 3 (divide by 8 with rounding)
+        uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc;  // cm[p] reads the crop table at offset p + dc; presumably yields p + dc clamped to 0..255 -- confirm against ff_cropTbl
+        block[j][0] = 0;  // clear the consumed coefficient; done before the early-out so it is zeroed even when dc == 0
+        if (!dc)
+            continue;  // skip the pixel writes for a zero dc (presumably cm[p] == p in that case)
+        for (i = 0; i < 4; i++) {  // 4 rows of 4 pixels
+            pix[0] = cm[pix[0]];
+            pix[1] = cm[pix[1]];
+            pix[2] = cm[pix[2]];
+            pix[3] = cm[pix[3]];
+            pix += stride;  // step down to the next row of this block
+        }
+    }
+}
 
 // because I like only having two parameters to pass functions...
 #define LOAD_PIXELS\
@@ -460,9 +479,10 @@
 
 av_cold void ff_vp8dsp_init(VP8DSPContext *dsp)
 {
-    dsp->vp8_luma_dc_wht = vp8_luma_dc_wht_c;
-    dsp->vp8_idct_add    = vp8_idct_add_c;
-    dsp->vp8_idct_dc_add = vp8_idct_dc_add_c;
+    dsp->vp8_luma_dc_wht  = vp8_luma_dc_wht_c;
+    dsp->vp8_idct_add     = vp8_idct_add_c;
+    dsp->vp8_idct_dc_add  = vp8_idct_dc_add_c;
+    dsp->vp8_idct_dc_add4 = vp8_idct_dc_add4_c;
 
     dsp->vp8_v_loop_filter16y = vp8_v_loop_filter16_c;
     dsp->vp8_h_loop_filter16y = vp8_h_loop_filter16_c;