changeset 3969:fc6e0942353b libavcodec

first try of a handwritten get_cabac() for x86; this is 10-20% faster on P3, depending on whether you try to subtract the START/STOP_TIMER overhead
author michael
date Mon, 09 Oct 2006 14:15:14 +0000
parents c86c7a54ba92
children 508d759b6906
files cabac.h
diffstat 1 files changed, 85 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/cabac.h	Mon Oct 09 13:37:43 2006 +0000
+++ b/cabac.h	Mon Oct 09 14:15:14 2006 +0000
@@ -364,6 +364,90 @@
 static int get_cabac(CABACContext *c, uint8_t * const state){
     //FIXME gcc generates duplicate load/stores for c->low and c->range
 //START_TIMER
+#ifdef ARCH_X86
+    int bit; // receives eax from the asm below (output operand %0)
+// NOTE(review): the strings below are used as displacements off the CABACContext pointer (%2); they look like hard-coded field offsets -- confirm against the struct layout
+#define LOW          "0"
+#define RANGE        "4"
+#define LPS_RANGE   "12"
+#define LPS_STATE   "12+2*66*4"
+#define MPS_STATE   "12+2*66*4+2*65"
+#define BYTESTART   "12+2*66*4+4*65"
+#define BYTE        "16+2*66*4+4*65"
+#define BYTEEND     "20+2*66*4+4*65"
+// asm operands: %0 = bit (tied to eax), %1 = state, %2 = c
+    asm volatile(
+        "movzbl (%1), %%eax                     \n\t" // eax = *state (zero-extended byte)
+        "movl "RANGE    "(%2), %%ebx            \n\t" // ebx = range (scratch copy, used for the table index)
+        "movl "RANGE    "(%2), %%edx            \n\t" // edx = range (working copy)
+        "shrl $23, %%ebx                        \n\t" // ebx = range >> 23
+        "leal "LPS_RANGE"(%2, %%eax, 4), %%esi  \n\t" // esi = c + LPS_RANGE + 4*state
+        "movzbl (%%ebx, %%esi), %%esi           \n\t" // esi = table byte at that address + (range >> 23)
+        "shll $17, %%esi                        \n\t" // esi <<= 17; the #else branch uses >>(CABAC_BITS+7) and <<(CABAC_BITS+1), so presumably CABAC_BITS is 16 here -- confirm
+        "movl "LOW      "(%2), %%ebx            \n\t" // ebx = low
+//eax:state ebx:low, edx:range, esi:RangeLPS
+        "subl %%esi, %%edx                      \n\t" // range -= RangeLPS
+        "cmpl %%edx, %%ebx                      \n\t"
+        " ja 1f                                 \n\t" // low > range (unsigned): take the path at label 1
+        "cmp $0x2000000, %%edx                  \n\t" //FIXME avoidable
+        "setb %%cl                              \n\t" // cl = 1 if range < 0x2000000 (unsigned), else 0
+        "shl %%cl, %%edx                        \n\t" // range <<= cl
+        "shl %%cl, %%ebx                        \n\t" // low   <<= cl
+        "movb "MPS_STATE"(%2, %%eax), %%cl      \n\t" // cl = byte at c + MPS_STATE + state
+        "movb %%cl, (%1)                        \n\t" // *state = cl
+//eax:state ebx:low, edx:range, esi:RangeLPS
+        "test %%bx, %%bx                        \n\t"
+        " jnz 2f                                \n\t" // low 16 bits of low are non-zero: skip the refill below
+        "movl "BYTE     "(%2), %%esi            \n\t" // esi = pointer stored at c + BYTE
+        "subl $0xFFFF, %%ebx                    \n\t" // low -= 0xFFFF
+        "movzwl (%%esi), %%ecx                  \n\t" // ecx = 16-bit load from that pointer
+        "bswap %%ecx                            \n\t"
+        "shrl $15, %%ecx                        \n\t" // bswap + shr 15 leaves (first byte << 9) | (second byte << 1)
+        "addl $2, %%esi                         \n\t" // advance the pointer by the 2 bytes consumed
+        "addl %%ecx, %%ebx                      \n\t" // low += the new bits
+        "movl %%esi, "BYTE    "(%2)             \n\t" // store the advanced pointer back
+        "jmp 2f                                 \n\t"
+        "1:                                     \n\t"
+//eax:state ebx:low, edx:range, esi:RangeLPS
+        "subl %%edx, %%ebx                      \n\t" // low -= range
+        "movl %%esi, %%edx                      \n\t" // range = RangeLPS
+        "shr $19, %%esi                         \n\t" // esi = RangeLPS >> 19, used as the table index below
+        "movb " MANGLE(ff_h264_norm_shift) "(%%esi), %%cl   \n\t" // cl = ff_h264_norm_shift[esi]
+        "shll %%cl, %%ebx                       \n\t" // low   <<= cl
+        "shll %%cl, %%edx                       \n\t" // range <<= cl
+        "movb "LPS_STATE"(%2, %%eax), %%cl      \n\t" // cl = byte at c + LPS_STATE + state
+        "movb %%cl, (%1)                        \n\t" // *state = cl
+        "incl %%eax                             \n\t" // eax = state + 1, so the final "andl $1" yields the opposite low bit to the other path
+        "test %%bx, %%bx                        \n\t"
+        " jnz 2f                                \n\t" // low 16 bits of low are non-zero: skip the refill below
+// refill on this path: fetch 2 bytes, then shift them by an amount derived from low
+        "movl "BYTE     "(%2), %%ecx            \n\t" // ecx = pointer stored at c + BYTE
+        "movzwl (%%ecx), %%esi                  \n\t" // esi = 16-bit load from that pointer
+        "bswap %%esi                            \n\t"
+        "shrl $15, %%esi                        \n\t" // esi = (first byte << 9) | (second byte << 1)
+        "subl $0xFFFF, %%esi                    \n\t" // esi -= 0xFFFF
+        "addl $2, %%ecx                         \n\t" // advance the pointer by the 2 bytes consumed
+        "movl %%ecx, "BYTE    "(%2)             \n\t" // store the advanced pointer back
+// shift count: cl = 7 - ff_h264_norm_shift[((low - 1) ^ low) >> 17]
+        "leal -1(%%ebx), %%ecx                  \n\t"
+        "xorl %%ebx, %%ecx                      \n\t"
+        "shrl $17, %%ecx                        \n\t"
+        "movb " MANGLE(ff_h264_norm_shift) "(%%ecx), %%cl   \n\t"
+        "neg %%cl                               \n\t"
+        "add $7, %%cl                           \n\t"
+// merge the fetched bits into low
+        "shll %%cl , %%esi                      \n\t"
+        "addl %%esi, %%ebx                      \n\t"
+        "2:                                     \n\t"
+        "movl %%edx, "RANGE    "(%2)            \n\t" // write range back to the context
+        "movl %%ebx, "LOW      "(%2)            \n\t" // write low back to the context
+        "andl $1, %%eax                         \n\t" // result bit = eax & 1
+
+        :"=&a"(bit) //FIXME this is fragile: gcc either runs out of registers or miscompiles it (for example if "+a"(bit) or "+m"(*state) is used)
+        :"r"(state), "r"(c)
+        : "%ecx", "%ebx", "%edx", "%esi" // NOTE(review): the asm stores to *state and to c (RANGE/LOW/BYTE), but no memory outputs or "memory" clobber are declared -- confirm gcc cannot cache those values across this statement
+    );
+#else
     int s = *state;
     int RangeLPS= c->lps_range[s][c->range>>(CABAC_BITS+7)]<<(CABAC_BITS+1);
     int bit, lps_mask attribute_unused;
@@ -417,6 +501,7 @@
     if(!(c->low & CABAC_MASK))
         refill2(c);
 #endif
+#endif
 //STOP_TIMER("get_cabac")
     return bit;
 }