diff i386/fft_3dn2.c @ 7263:fc843d00867c libavcodec

exploit mdct symmetry: 2% faster vorbis on conroe, k8; 7% on celeron.
author lorenm
date Sun, 13 Jul 2008 15:03:58 +0000
parents f7cbb7733146
children a8a8205a9081
line wrap: on
line diff
--- a/i386/fft_3dn2.c	Sun Jul 13 14:59:39 2008 +0000
+++ b/i386/fft_3dn2.c	Sun Jul 13 15:03:58 2008 +0000
@@ -124,10 +124,9 @@
     asm volatile("femms");
 }
 
-void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
-                        const FFTSample *input, FFTSample *tmp)
+static void imdct_3dn2(MDCTContext *s, const FFTSample *input, FFTSample *tmp)
 {
-    long n8, n4, n2, n;
+    long n4, n2, n;
     x86_reg k;
     const uint16_t *revtab = s->fft.revtab;
     const FFTSample *tcos = s->tcos;
@@ -138,7 +137,6 @@
     n = 1 << s->nbits;
     n2 = n >> 1;
     n4 = n >> 2;
-    n8 = n >> 3;
 
     /* pre rotation */
     in1 = input;
@@ -182,6 +180,20 @@
             :"m"(tcos[k]), "m"(tsin[k])
         );
     }
+}
+
+void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
+                        const FFTSample *input, FFTSample *tmp)
+{
+    x86_reg k;
+    long n8, n2, n;
+    FFTComplex *z = (FFTComplex *)tmp;
+
+    n = 1 << s->nbits;
+    n2 = n >> 1;
+    n8 = n >> 3;
+
+    imdct_3dn2(s, input, tmp);
 
     k = n-8;
     asm volatile("movd %0, %%mm7" ::"r"(1<<31));
@@ -212,3 +224,40 @@
     asm volatile("femms");
 }
 
+void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output,
+                        const FFTSample *input, FFTSample *tmp)
+{
+    x86_reg j, k;
+    long n8, n4, n;
+    FFTComplex *z = (FFTComplex *)tmp;
+
+    n = 1 << s->nbits;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    imdct_3dn2(s, input, tmp);
+
+    j = -n;
+    k = n-8;
+    asm volatile("movd %0, %%mm7" ::"r"(1<<31));
+    asm volatile(
+        "1: \n\t"
+        "movq    (%3,%1), %%mm0 \n\t" // z[n8+k]
+        "pswapd  (%3,%0), %%mm1 \n\t" // z[n8-1-k]
+        "movq      %%mm0, %%mm2 \n\t"
+        "punpckldq %%mm1, %%mm0 \n\t"
+        "punpckhdq %%mm2, %%mm1 \n\t"
+        "pxor      %%mm7, %%mm0 \n\t"
+        "pxor      %%mm7, %%mm1 \n\t"
+        "movq      %%mm0, (%2,%1) \n\t" // output[n4+2*k]   = { -z[n8+k].re, z[n8-1-k].im }
+        "movq      %%mm1, (%2,%0) \n\t" // output[n4-2-2*k] = { -z[n8-1-k].re, z[n8+k].im }
+        "sub $8, %1 \n\t"
+        "add $8, %0 \n\t"
+        "jl 1b \n\t"
+        :"+r"(j), "+r"(k)
+        :"r"(output+n4), "r"(z+n8)
+        :"memory"
+    );
+    asm volatile("femms");
+}
+