changeset 3975:6cc9eb5ee5e3 libavcodec

x86 branchless cabac decoder slightly faster on P3
author michael
date Mon, 09 Oct 2006 20:51:33 +0000
parents 93746612bc78
children 27e90123b346
files cabac.h
diffstat 1 files changed, 68 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/cabac.h	Mon Oct 09 20:44:11 2006 +0000
+++ b/cabac.h	Mon Oct 09 20:51:33 2006 +0000
@@ -31,6 +31,7 @@
 
 #define CABAC_BITS 16
 #define CABAC_MASK ((1<<CABAC_BITS)-1)
+#define BRANCHLESS_CABAD 1
 
 typedef struct CABACContext{
     int low;
@@ -374,7 +375,7 @@
 #define BYTESTART   "12+2*66*4+4*65"
 #define BYTE        "16+2*66*4+4*65"
 #define BYTEEND     "20+2*66*4+4*65"
-
+#ifndef BRANCHLESS_CABAD
     asm volatile(
         "movzbl (%1), %%eax                     \n\t"
         "movl "RANGE    "(%2), %%ebx            \n\t"
@@ -447,6 +448,72 @@
         : "%ecx", "%ebx", "%edx", "%esi"
     );
 #else
+    asm volatile(
+        "movzbl (%1), %%eax                     \n\t"
+        "movl "RANGE    "(%2), %%ebx            \n\t"
+        "movl "RANGE    "(%2), %%edx            \n\t"
+        "shrl $23, %%ebx                        \n\t"
+        "leal "LPS_RANGE"(%2, %%eax, 4), %%esi  \n\t"
+        "movzbl (%%ebx, %%esi), %%esi           \n\t"
+        "shll $17, %%esi                        \n\t"
+        "movl "LOW      "(%2), %%ebx            \n\t"
+//eax:state ebx:low, edx:range, esi:RangeLPS
+        "subl %%esi, %%edx                      \n\t"
+        "movl %%edx, %%ecx                      \n\t"
+        "subl %%ebx, %%edx                      \n\t"
+        "sarl $31, %%edx                        \n\t" //lps_mask
+        "subl %%ecx, %%esi                      \n\t" //RangeLPS - range
+        "andl %%edx, %%esi                      \n\t" //(RangeLPS - range)&lps_mask
+        "addl %%ecx, %%esi                      \n\t" //new range
+        "andl %%edx, %%ecx                      \n\t"
+        "subl %%ecx, %%ebx                      \n\t"
+
+//eax:state ebx:low edx:mask esi:range
+        "movl $-130, %%ecx                      \n\t"
+        "andl %%edx, %%ecx                      \n\t"
+        "addl %%eax, %%ecx                      \n\t"
+
+        "xorl %%edx, %%eax                      \n\t"
+        "movb "MPS_STATE"(%2, %%eax), %%cl      \n\t"
+        "movb %%cl, (%1)                        \n\t"
+
+        "movl %%esi, %%edx                      \n\t"
+//eax:bit ebx:low edx:range esi:range
+
+        "shr $19, %%esi                         \n\t"
+        "movb " MANGLE(ff_h264_norm_shift) "(%%esi), %%cl   \n\t"
+        "shll %%cl, %%ebx                       \n\t"
+        "shll %%cl, %%edx                       \n\t"
+        "test %%bx, %%bx                        \n\t"
+        " jnz 1f                                \n\t"
+
+        "movl "BYTE     "(%2), %%ecx            \n\t"
+        "movzwl (%%ecx), %%esi                  \n\t"
+        "bswap %%esi                            \n\t"
+        "shrl $15, %%esi                        \n\t"
+        "subl $0xFFFF, %%esi                    \n\t"
+        "addl $2, %%ecx                         \n\t"
+        "movl %%ecx, "BYTE    "(%2)             \n\t"
+
+        "leal -1(%%ebx), %%ecx                  \n\t"
+        "xorl %%ebx, %%ecx                      \n\t"
+        "shrl $17, %%ecx                        \n\t"
+        "movb " MANGLE(ff_h264_norm_shift) "(%%ecx), %%cl   \n\t"
+        "neg %%cl                               \n\t"
+        "add $7, %%cl                           \n\t"
+
+        "shll %%cl , %%esi                      \n\t"
+        "addl %%esi, %%ebx                      \n\t"
+        "1:                                     \n\t"
+        "movl %%edx, "RANGE    "(%2)            \n\t"
+        "movl %%ebx, "LOW      "(%2)            \n\t"
+        "andl $1, %%eax                         \n\t"
+        :"=&a"(bit)
+        :"r"(state), "r"(c)
+        : "%ecx", "%ebx", "%edx", "%esi"
+    );
+#endif
+#else
     int s = *state;
     int RangeLPS= c->lps_range[s][c->range>>(CABAC_BITS+7)]<<(CABAC_BITS+1);
     int bit, lps_mask attribute_unused;