diff dsputil.c @ 4988:689490842cf5 libavcodec

Factor sum_abs_dctelem out of dct_sad, and SIMD it. Cycle counts for sum_abs_dctelem_* alone: core2: c=186 mmx2=39 sse2=21 ssse3=13; k8: c=163 mmx2=33 sse2=31; p4: c=370 mmx2=60 sse2=60. Cycle counts for dct_sad including sum_abs_dctelem_*: core2: c=405 mmx2=258 sse2=240 ssse3=232; k8: c=624 mmx2=394 sse2=392; p4: c=849 mmx2=556 sse2=556.
author lorenm
date Sat, 12 May 2007 02:41:25 +0000
parents 7011f597e473
children f7edc4fe94db
line wrap: on
line diff
--- a/dsputil.c	Sat May 12 01:16:06 2007 +0000
+++ b/dsputil.c	Sat May 12 02:41:25 2007 +0000
@@ -592,6 +592,14 @@
     }
 }
 
+static int sum_abs_dctelem_c(DCTELEM *block) /* returns the sum of FFABS(block[i]) for i = 0..63 */
+{
+    int sum=0, i;
+    for(i=0; i<64; i++) /* reads exactly 64 elements; block is not written */
+        sum+= FFABS(block[i]); /* FFABS is defined elsewhere (presumably absolute value -- confirm) */
+    return sum;
+}
+
 #if 0
 
 #define PIXOP2(OPNAME, OP) \
@@ -3385,19 +3393,14 @@
 
 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
     MpegEncContext * const s= (MpegEncContext *)c;
-    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
+    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
     DCTELEM * const temp= (DCTELEM*)aligned_temp;
-    int sum=0, i;
 
     assert(h==8);
 
     s->dsp.diff_pixels(temp, src1, src2, stride);
     s->dsp.fdct(temp);
-
-    for(i=0; i<64; i++)
-        sum+= FFABS(temp[i]);
-
-    return sum;
+    return s->dsp.sum_abs_dctelem(temp);
 }
 
 #ifdef CONFIG_GPL
@@ -3905,6 +3908,7 @@
     c->add_pixels_clamped = add_pixels_clamped_c;
     c->add_pixels8 = add_pixels8_c;
     c->add_pixels4 = add_pixels4_c;
+    c->sum_abs_dctelem = sum_abs_dctelem_c;
     c->gmc1 = gmc1_c;
     c->gmc = ff_gmc_c;
     c->clear_blocks = clear_blocks_c;