changeset 5724:96d1b6c30aad libavcodec

Unroll encode_residual_lpc(). Speedup varies between 1.2x and 1.8x depending on LPC order.
author lorenm
date Thu, 27 Sep 2007 02:42:00 +0000
parents 49a5d44423ef
children 2ec1ec2d1eae
files flacenc.c utils.c
diffstat 2 files changed, 84 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/flacenc.c	Wed Sep 26 13:28:13 2007 +0000
+++ b/flacenc.c	Thu Sep 27 02:42:00 2007 +0000
@@ -834,15 +834,83 @@
     }
 }
 
+#define LPC1(x) { /* one unrolled filter tap; uses i, smp, coefs, c, p0, p1 from the expansion site */\
+    int s = smp[i-(x)+1]; /* sample (x)-1 positions before smp[i], shared by both sums */\
+    p1 += c*s; /* c still holds the value left by the previous step */\
+    c = coefs[(x)-2]; /* step c down one coefficient before it is used for p0 */\
+    p0 += c*s;\
+}
+
+static av_always_inline void encode_residual_lpc_unrolled( /* fills res[i] = smp[i] - (prediction >> shift) for i >= order */
+    int32_t *res, const int32_t *smp, int n, /* res[0..order-1] are not written by this function */
+    int order, const int32_t *coefs, int shift, int big) /* big != 0 picks the switch with cases 32..9, else the one with cases 8..2 */
+{
+    int i;
+    for(i=order; i<n; i+=2) { /* two outputs per pass; NOTE(review): touches index n of smp/res when n-order is odd - confirm buffers allow it */
+        int c = coefs[order-1]; /* start at the last coefficient; each LPC1 step moves c one coefficient lower */
+        int p0 = c * smp[i-order]; /* p0: weighted sum used for res[i], seeded with its oldest term */
+        int p1 = 0; /* p1: weighted sum used for res[i+1] */
+        if(big) {
+            switch(order) { /* no break statements: entering at case `order` runs LPC1(order) down to LPC1(2) */
+                case 32: LPC1(32)
+                case 31: LPC1(31)
+                case 30: LPC1(30)
+                case 29: LPC1(29)
+                case 28: LPC1(28)
+                case 27: LPC1(27)
+                case 26: LPC1(26)
+                case 25: LPC1(25)
+                case 24: LPC1(24)
+                case 23: LPC1(23)
+                case 22: LPC1(22)
+                case 21: LPC1(21)
+                case 20: LPC1(20)
+                case 19: LPC1(19)
+                case 18: LPC1(18)
+                case 17: LPC1(17)
+                case 16: LPC1(16)
+                case 15: LPC1(15)
+                case 14: LPC1(14)
+                case 13: LPC1(13)
+                case 12: LPC1(12)
+                case 11: LPC1(11)
+                case 10: LPC1(10)
+                case  9: LPC1( 9)
+                         LPC1( 8) /* steps 8..2 carry no case label here; they are reached only by falling through from 9 */
+                         LPC1( 7)
+                         LPC1( 6)
+                         LPC1( 5)
+                         LPC1( 4)
+                         LPC1( 3)
+                         LPC1( 2)
+            } /* an order outside 9..32 matches no case and skips every step above */
+        } else {
+            switch(order) { /* same fall-through scheme; order 1 matches no case and runs no LPC1 step */
+                case  8: LPC1( 8)
+                case  7: LPC1( 7)
+                case  6: LPC1( 6)
+                case  5: LPC1( 5)
+                case  4: LPC1( 4)
+                case  3: LPC1( 3)
+                case  2: LPC1( 2)
+            }
+        }
+        p1 += c * smp[i]; /* closing term of p1; c is coefs[0] here when a case matched or order is 1 */
+        res[i  ] = smp[i  ] - (p0 >> shift);
+        res[i+1] = smp[i+1] - (p1 >> shift);
+    }
+}
+
 static void encode_residual_lpc(int32_t *res, const int32_t *smp, int n,
                                 int order, const int32_t *coefs, int shift)
 {
-    int i, j;
-
+    int i;
     for(i=0; i<order; i++) {
         res[i] = smp[i];
     }
+#ifdef CONFIG_SMALL
     for(i=order; i<n; i+=2) {
+        int j;
         int32_t c = coefs[0];
         int32_t p0 = 0, p1 = c*smp[i];
         for(j=1; j<order; j++) {
@@ -855,6 +923,19 @@
         res[i+0] = smp[i+0] - (p0 >> shift);
         res[i+1] = smp[i+1] - (p1 >> shift);
     }
+#else
+    switch(order) {
+        case  1: encode_residual_lpc_unrolled(res, smp, n, 1, coefs, shift, 0); break;
+        case  2: encode_residual_lpc_unrolled(res, smp, n, 2, coefs, shift, 0); break;
+        case  3: encode_residual_lpc_unrolled(res, smp, n, 3, coefs, shift, 0); break;
+        case  4: encode_residual_lpc_unrolled(res, smp, n, 4, coefs, shift, 0); break;
+        case  5: encode_residual_lpc_unrolled(res, smp, n, 5, coefs, shift, 0); break;
+        case  6: encode_residual_lpc_unrolled(res, smp, n, 6, coefs, shift, 0); break;
+        case  7: encode_residual_lpc_unrolled(res, smp, n, 7, coefs, shift, 0); break;
+        case  8: encode_residual_lpc_unrolled(res, smp, n, 8, coefs, shift, 0); break;
+        default: encode_residual_lpc_unrolled(res, smp, n, order, coefs, shift, 1); break;
+    }
+#endif
 }
 
 static int encode_residual(FlacEncodeContext *ctx, int ch)
--- a/utils.c	Wed Sep 26 13:28:13 2007 +0000
+++ b/utils.c	Thu Sep 27 02:42:00 2007 +0000
@@ -670,7 +670,7 @@
 {"context", "context model", OFFSET(context_model), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E},
 {"slice_flags", NULL, OFFSET(slice_flags), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
 {"xvmc_acceleration", NULL, OFFSET(xvmc_acceleration), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX},
-{"mbd", "macroblock decision algorithm (high quality mode)", OFFSET(mb_decision), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|E, "mbd"},
+{"mbd", "macroblock decision algorithm (high quality mode)", OFFSET(mb_decision), FF_OPT_TYPE_INT, DEFAULT, INT_MIN, INT_MAX, V|A|E, "mbd"},
 {"simple", "use mbcmp (default)", 0, FF_OPT_TYPE_CONST, FF_MB_DECISION_SIMPLE, INT_MIN, INT_MAX, V|E, "mbd"},
 {"bits", "use fewest bits", 0, FF_OPT_TYPE_CONST, FF_MB_DECISION_BITS, INT_MIN, INT_MAX, V|E, "mbd"},
 {"rd", "use best rate distortion", 0, FF_OPT_TYPE_CONST, FF_MB_DECISION_RD, INT_MIN, INT_MAX, V|E, "mbd"},