diff vp8.c @ 12253:112b3a0db187 libavcodec

Decode DCT tokens by branching to a different code path for each branch on the Huffman tree, instead of traversing the tree in a while loop. Based on the similar optimization in libvpx's detokenize.c. 10% faster at normal bitrates, and 30% faster for high-bitrate intra-only.
author conrad
date Fri, 23 Jul 2010 21:46:17 +0000
parents 35ee666e4496
children 17c151e1280a
line wrap: on
line diff
--- a/vp8.c	Fri Jul 23 21:46:14 2010 +0000
+++ b/vp8.c	Fri Jul 23 21:46:17 2010 +0000
@@ -800,36 +800,61 @@
                                uint8_t probs[8][3][NUM_DCT_TOKENS-1],
                                int i, int zero_nhood, int16_t qmul[2])
 {
-    int token, nonzero = 0;
-    int offset = 0;
+    uint8_t *token_prob;
+    int nonzero = 0;
+    int coeff;
 
-    for (; i < 16; i++) {
-        token = vp8_rac_get_tree_with_offset(c, vp8_coeff_tree, probs[vp8_coeff_band[i]][zero_nhood], offset);
+    do {
+        token_prob = probs[vp8_coeff_band[i]][zero_nhood];
 
-        if (token == DCT_EOB)
-            break;
-        else if (token >= DCT_CAT1) {
-            int cat = token-DCT_CAT1;
-            token = vp8_rac_get_coeff(c, vp8_dct_cat_prob[cat]);
-            token += 3 + (2<<cat);
+        if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
+            return nonzero;
+
+skip_eob:
+        if (!vp56_rac_get_prob_branchy(c, token_prob[1])) { // DCT_0
+            zero_nhood = 0;
+            token_prob = probs[vp8_coeff_band[++i]][0];
+            if (i < 16)
+                goto skip_eob;
+            return nonzero; // invalid input; blocks should end with EOB
         }
 
-        // after the first token, the non-zero prediction context becomes
-        // based on the last decoded coeff
-        if (!token) {
-            zero_nhood = 0;
-            offset = 1;
-            continue;
-        } else if (token == 1)
+        if (!vp56_rac_get_prob_branchy(c, token_prob[2])) { // DCT_1
+            coeff = 1;
             zero_nhood = 1;
-        else
+        } else {
             zero_nhood = 2;
 
+            if (!vp56_rac_get_prob_branchy(c, token_prob[3])) { // DCT 2,3,4
+                coeff = vp56_rac_get_prob(c, token_prob[4]);
+                if (coeff)
+                    coeff += vp56_rac_get_prob(c, token_prob[5]);
+                coeff += 2;
+            } else {
+                // DCT_CAT*
+                if (!vp56_rac_get_prob_branchy(c, token_prob[6])) {
+                    if (!vp56_rac_get_prob_branchy(c, token_prob[7])) { // DCT_CAT1
+                        coeff  = 5 + vp56_rac_get_prob(c, vp8_dct_cat1_prob[0]);
+                    } else {                                    // DCT_CAT2
+                        coeff  = 7;
+                        coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[0]) << 1;
+                        coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[1]);
+                    }
+                } else {    // DCT_CAT3 and up
+                    int a = vp56_rac_get_prob(c, token_prob[8]);
+                    int b = vp56_rac_get_prob(c, token_prob[9+a]);
+                    int cat = (a<<1) + b;
+                    coeff  = 3 + (8<<cat);
+                    coeff += vp8_rac_get_coeff(c, vp8_dct_cat_prob[cat]);
+                }
+            }
+        }
+
         // todo: full [16] qmat? load into register?
-        block[zigzag_scan[i]] = (vp8_rac_get(c) ? -token : token) * qmul[!!i];
-        nonzero = i+1;
-        offset = 0;
-    }
+        block[zigzag_scan[i]] = (vp8_rac_get(c) ? -coeff : coeff) * qmul[!!i];
+        nonzero = ++i;
+    } while (i < 16);
+
     return nonzero;
 }