changeset 3943:811a9b0d9f32 libavcodec

Several x86 renorm_cabac_decoder_once optimizations, plus START/STOP_TIMER benchmarking code for them. Please benchmark on P4 & Athlon (I'll remove the benchmarking code and the always-slower variants as soon as P4/Athlon benchmarks have been posted or committed).
author michael
date Sat, 07 Oct 2006 11:15:10 +0000
parents b04e4818b5bd
children edacbbf46a19
files cabac.h
diffstat 1 files changed, 63 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/cabac.h	Sat Oct 07 09:22:44 2006 +0000
+++ b/cabac.h	Sat Oct 07 11:15:10 2006 +0000
@@ -295,15 +295,77 @@
 }
 
 static inline void renorm_cabac_decoder_once(CABACContext *c){ // shifts c->range and c->low left by one bit when c->range is below 0x200<<CABAC_BITS, then calls refill(c) if the CABAC_MASK bits of c->low are all zero
+#ifdef ARCH_X86 // hand-written asm variants; exactly one is selected by the #if/#elif chain below
+    int temp; // scratch operand for the asm; NOTE(review): handed to the asm as a "+" (read-write) operand while still uninitialized -- an early-clobber output ("=&") may be what is intended, confirm
+#if 0 // disabled variant: shift count built with lea/shr
+    //P3:683 (presumably a benchmark figure measured on a P3; lower looks better given which variant is enabled -- confirm)
+    asm(
+        "lea -0x20000(%0), %2       \n\t" // %2 = range - 0x20000; NOTE(review): constant is hard-coded, the C fallback below uses 0x200 << CABAC_BITS
+        "shr $31, %2                \n\t"  //%2 = top bit of the difference (1 if it is negative, else 0); FIXME 31->63 for x86-64
+        "shl %%cl, %0               \n\t" // range <<= %2 (%2 is constrained to ecx, so cl holds the count)
+        "shl %%cl, %1               \n\t" // low <<= the same count
+        : "+r"(c->range), "+r"(c->low), "+c"(temp) // %0 = c->range, %1 = c->low, %2 = temp in ecx
+    );
+#elif 0 // disabled variant: shift count built with cmp/setb
+    //P3:680
+    asm(
+        "cmp $0x20000, %0           \n\t" // carry flag set if range < 0x20000 (unsigned compare)
+        "setb %%cl                  \n\t"  //cl = 1 if below, else 0; NOTE(review): the following FIXME looks copied from the shr variant, there is no 31 here -- FIXME 31->63 for x86-64
+        "shl %%cl, %0               \n\t" // range <<= cl
+        "shl %%cl, %1               \n\t" // low <<= cl
+        : "+r"(c->range), "+r"(c->low), "+c"(temp) // %0 = c->range, %1 = c->low, temp occupies ecx
+    );
+#elif 1 // the variant that is currently compiled: branchless mask built with cdq
+    int temp2; // second scratch operand, pinned to edx by the constraint below
+    //P3:665
+    asm(
+        "lea -0x20000(%0), %%eax    \n\t" // eax = range - 0x20000
+        "cdq                        \n\t" // edx = sign extension of eax: all ones if eax is negative, else 0
+        "mov %0, %%eax              \n\t" // eax = range
+        "and %%edx, %0              \n\t" // range &= mask (range or 0)
+        "and %1, %%edx              \n\t" // edx = low & mask (low or 0)
+        "add %%eax, %0              \n\t" // range = (range & mask) + range, i.e. doubled when the mask is set
+        "add %%edx, %1              \n\t" // low += low & mask, i.e. doubled when the mask is set
+        : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) // temp pinned to eax, temp2 to edx; both are uninitialized on entry (see NOTE on temp)
+    );
+#elif 0 // disabled variant: same masking scheme, mask built with cmp/sbb
+    int temp2; // scratch operand pinned to edx
+    //P3:673
+    asm(
+        "cmp $0x20000, %0           \n\t" // carry flag set if range < 0x20000 (unsigned compare)
+        "sbb %%edx, %%edx           \n\t" // edx = -carry: all ones if below, else 0
+        "mov %0, %%eax              \n\t" // eax = range
+        "and %%edx, %0              \n\t" // range &= mask
+        "and %1, %%edx              \n\t" // edx = low & mask
+        "add %%eax, %0              \n\t" // range = (range & mask) + range
+        "add %%edx, %1              \n\t" // low += low & mask
+        : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) // temp pinned to eax, temp2 to edx
+    );
+#else // not compiled while an "#elif 1" branch above is enabled: cmov variant
+    int temp2; // scratch operand pinned to edx
+    //P3:677
+    asm(
+        "cmp $0x20000, %0           \n\t" // carry flag set if range < 0x20000 (unsigned compare)
+        "lea (%0, %0), %%eax        \n\t" // eax = 2*range (lea does not modify the flags set by cmp)
+        "lea (%1, %1), %%edx        \n\t" // edx = 2*low
+        "cmovb %%eax, %0            \n\t" // range = 2*range if it was below 0x20000
+        "cmovb %%edx, %1            \n\t" // low = 2*low under the same condition
+        : "+r"(c->range), "+r"(c->low), "+a"(temp), "+d"(temp2) // temp pinned to eax, temp2 to edx
+    );
+#endif
+#else // ARCH_X86 not defined: plain C version
+    //P3:675
     int shift= (uint32_t)(c->range - (0x200 << CABAC_BITS))>>31; // top bit of the 32-bit difference: 1 if it is negative, else 0
     c->range<<= shift;
     c->low  <<= shift;
+#endif
     if(!(c->low & CABAC_MASK)) // all bits of low selected by CABAC_MASK are zero
         refill(c); // refill() is defined outside this view; presumably it feeds more bitstream data into c->low -- confirm
 }
 
 static int get_cabac(CABACContext *c, uint8_t * const state){
     //FIXME gcc generates duplicate load/stores for c->low and c->range
+START_TIMER
     int s = *state;
     int RangeLPS= c->lps_range[s][c->range>>(CABAC_BITS+7)]<<(CABAC_BITS+1);
     int bit, lps_mask attribute_unused;
@@ -342,7 +404,7 @@
     if(!(c->low & CABAC_MASK))
         refill2(c);
 #endif
-
+STOP_TIMER("get_cabac")
     return bit;
 }