libavcodec.hg: i386/fft_3dn.c comparison

comparison i386/fft_3dn.c @ 3590:a3d97c60ea07 libavcodec

ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm. 2.5% faster fft, 0.5% faster vorbis.

author	lorenm
date	Fri, 18 Aug 2006 23:53:49 +0000
parents	c20c181e0eca
children	c8c591fe26f8

comparison

Legend: equal, deleted, inserted, replaced

-:19c437d6aba5
+:a3d97c60ea07
 /*
 * FFT/MDCT transform with 3DNow! optimizations
-* Copyright (c) 2006 Zuxy MENG Jie.
+* Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
 * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #include "../dsputil.h"
-#include <math.h>
-#ifdef HAVE_MM3DNOW
-#include <mm3dnow.h>
 static const int p1m1[2] __attribute__((aligned(8))) =
 { 0, 1 << 31 };
 static const int m1p1[2] __attribute__((aligned(8))) =
 { 1 << 31, 0 };
 void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z)
 {
 int ln = s->nbits;
-int j, np, np2;
+long i, j;
-int nblocks, nloops;
+long nblocks, nloops;
-register FFTComplex *p, *q;
+FFTComplex *p, *cptr;
-FFTComplex *cptr, *cptr1;
-int k;
-np = 1 << ln;
+asm volatile(
-/* FEMMS not a must here but recommended by AMD */
+/* FEMMS is not a must here but recommended by AMD */
-_m_femms();
+"femms \n\t"
+"movq %0, %%mm7 \n\t"
+::"m"(*(s->inverse ? m1p1 : p1m1))
+);
-{
+i = 8 << ln;
-__m64 *r, a0, a1, b0, b1, tmp, c;
+asm volatile(
+"1: \n\t"
-r = (__m64 *)&z[0];
+"sub $32, %0 \n\t"
-if (s->inverse)
+"movq    (%0,%1), %%mm0 \n\t"
-c = *(__m64 *)m1p1;
+"movq  16(%0,%1), %%mm1 \n\t"
-else
+"movq   8(%0,%1), %%mm2 \n\t"
-c = *(__m64 *)p1m1;
+"movq  24(%0,%1), %%mm3 \n\t"
+"movq      %%mm0, %%mm4 \n\t"
-j = (np >> 2);
+"movq      %%mm1, %%mm5 \n\t"
-do {
+"pfadd     %%mm2, %%mm0 \n\t"
-/* do the pass 0 butterfly */
+"pfadd     %%mm3, %%mm1 \n\t"
-a0 = _m_pfadd(r[0], r[1]);
+"pfsub     %%mm2, %%mm4 \n\t"
-a1 = _m_pfsub(r[0], r[1]);
+"pfsub     %%mm3, %%mm5 \n\t"
+"movq      %%mm0, %%mm2 \n\t"
-/* do the pass 0 butterfly */
+"punpckldq %%mm5, %%mm6 \n\t"
-b0 = _m_pfadd(r[2], r[3]);
+"punpckhdq %%mm6, %%mm5 \n\t"
-b1 = _m_pfsub(r[2], r[3]);
+"movq      %%mm4, %%mm3 \n\t"
+"pxor      %%mm7, %%mm5 \n\t"
-/* multiply third by -i */
+"pfadd     %%mm1, %%mm0 \n\t"
-tmp = _m_punpckhdq(b1, b1);
+"pfadd     %%mm5, %%mm4 \n\t"
-b1 = _m_punpckldq(b1, b1);
+"pfsub     %%mm1, %%mm2 \n\t"
-b1 = _m_punpckldq(tmp, b1);
+"pfsub     %%mm5, %%mm3 \n\t"
-b1 = _m_pxor(b1, c);
+"movq      %%mm0,   (%0,%1) \n\t"
+"movq      %%mm4,  8(%0,%1) \n\t"
-/* do the pass 1 butterfly */
+"movq      %%mm2, 16(%0,%1) \n\t"
-r[0] = _m_pfadd(a0, b0);
+"movq      %%mm3, 24(%0,%1) \n\t"
-r[1] = _m_pfadd(a1, b1);
+"jg 1b \n\t"
-r[2] = _m_pfsub(a0, b0);
+:"+r"(i)
-r[3] = _m_pfsub(a1, b1);
+:"r"(z)
-r += 4;
+);
-} while (--j != 0);
-}
 /* pass 2 .. ln-1 */
-nblocks = np >> 3;
+nblocks = 1 << (ln-3);
 nloops = 1 << 2;
-np2 = np >> 1;
+cptr = s->exptab1;
-cptr1 = s->exptab1;
 do {
 p = z;
-q = z + nloops;
 j = nblocks;
 do {
-cptr = cptr1;
+i = nloops*8;
-k = nloops >> 1;
+asm volatile(
-do {
+"1: \n\t"
-__m64 a0, a1, b0, b1, c0, c1, t10, t11, t20, t21;
+"sub $16, %0 \n\t"
+"movq    (%1,%0), %%mm0 \n\t"
-a0 = *(__m64 *)&p[0];
+"movq   8(%1,%0), %%mm1 \n\t"
-a1 = *(__m64 *)&p[1];
+"movq    (%2,%0), %%mm2 \n\t"
-b0 = *(__m64 *)&q[0];
+"movq   8(%2,%0), %%mm3 \n\t"
-b1 = *(__m64 *)&q[1];
+"movq      %%mm2, %%mm4 \n\t"
+"movq      %%mm3, %%mm5 \n\t"
-/* complex mul */
+"punpckldq %%mm2, %%mm2 \n\t"
-c0 = *(__m64 *)&cptr[0];
+"punpckldq %%mm3, %%mm3 \n\t"
-c1 = *(__m64 *)&cptr[1];
+"punpckhdq %%mm4, %%mm4 \n\t"
-/*  cre*re cim*re */
+"punpckhdq %%mm5, %%mm5 \n\t"
-t10 = _m_pfmul(c0, _m_punpckldq(b0, b0));
+"pfmul   (%3,%0,2), %%mm2 \n\t" //  cre*re cim*re
-t11 = _m_pfmul(c1, _m_punpckldq(b1, b1));
+"pfmul  8(%3,%0,2), %%mm3 \n\t"
-c0 = *(__m64 *)&cptr[2];
+"pfmul 16(%3,%0,2), %%mm4 \n\t" // -cim*im cre*im
-c1 = *(__m64 *)&cptr[3];
+"pfmul 24(%3,%0,2), %%mm5 \n\t"
-/*  -cim*im cre*im */
+"pfadd     %%mm2, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
-t20 = _m_pfmul(c0, _m_punpckhdq(b0, b0));
+"pfadd     %%mm3, %%mm5 \n\t"
-t21 = _m_pfmul(c1, _m_punpckhdq(b1, b1));
+"movq      %%mm0, %%mm2 \n\t"
-b0 = _m_pfadd(t10, t20);
+"movq      %%mm1, %%mm3 \n\t"
-b1 = _m_pfadd(t11, t21);
+"pfadd     %%mm4, %%mm0 \n\t"
+"pfadd     %%mm5, %%mm1 \n\t"
-/* butterfly */
+"pfsub     %%mm4, %%mm2 \n\t"
-*(__m64 *)&p[0] = _m_pfadd(a0, b0);
+"pfsub     %%mm5, %%mm3 \n\t"
-*(__m64 *)&p[1] = _m_pfadd(a1, b1);
+"movq      %%mm0,  (%1,%0) \n\t"
-*(__m64 *)&q[0] = _m_pfsub(a0, b0);
+"movq      %%mm1, 8(%1,%0) \n\t"
-*(__m64 *)&q[1] = _m_pfsub(a1, b1);
+"movq      %%mm2,  (%2,%0) \n\t"
+"movq      %%mm3, 8(%2,%0) \n\t"
-p += 2;
+"jg 1b \n\t"
-q += 2;
+:"+r"(i)
-cptr += 4;
+:"r"(p), "r"(p + nloops), "r"(cptr)
-} while (--k);
+);
+p += nloops*2;
-p += nloops;
-q += nloops;
 } while (--j);
-cptr1 += nloops * 2;
+cptr += nloops*2;
-nblocks = nblocks >> 1;
+nblocks >>= 1;
-nloops = nloops << 1;
+nloops <<= 1;
 } while (nblocks != 0);
-_m_femms();
+asm volatile("femms");
 }
-#endif

Mercurial > libavcodec.hg

comparison i386/fft_3dn.c @ 3590:a3d97c60ea07 libavcodec