annotate i386/fft_3dn2.c @ 6047:2f9c17454842 libavcodec

Add option for user to scale the amount of dynamic range compression which is applied by the audio decoder, and use that option in the AC3 decoder.
author jbr
date Thu, 20 Dec 2007 00:55:08 +0000
parents d5ba514e3f4a
children 33896780c612
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
1 /*
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
2 * FFT/MDCT transform with Extended 3DNow! optimizations
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
3 * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
4 * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
5 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3748
diff changeset
6 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3748
diff changeset
7 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3748
diff changeset
8 * FFmpeg is free software; you can redistribute it and/or
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
9 * modify it under the terms of the GNU Lesser General Public
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
10 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3748
diff changeset
11 * version 2.1 of the License, or (at your option) any later version.
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
12 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3748
diff changeset
13 * FFmpeg is distributed in the hope that it will be useful,
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
16 * Lesser General Public License for more details.
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
17 *
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
18 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3748
diff changeset
19 * License along with FFmpeg; if not, write to the Free Software
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
21 */
5010
d5ba514e3f4a Add libavcodec to compiler include flags in order to simplify header
diego
parents: 3947
diff changeset
22 #include "dsputil.h"
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
23
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
/*
 * Sign-bit masks, XORed into an MMX register to negate one of the two
 * packed floats: p1m1 flips the sign of the high element, m1p1 of the low.
 * The masks are built with an unsigned shift because 1 << 31 on a 32-bit
 * int shifts into the sign bit, which is undefined behaviour.
 */
static const unsigned int p1m1[2] __attribute__((aligned(8))) =
    { 0, 1U << 31 };

static const unsigned int m1p1[2] __attribute__((aligned(8))) =
    { 1U << 31, 0 };
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
29
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
30 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
31 {
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
32 int ln = s->nbits;
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
33 long i, j;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
34 long nblocks, nloops;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
35 FFTComplex *p, *cptr;
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
36
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
37 asm volatile(
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
38 /* FEMMS is not a must here but recommended by AMD */
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
39 "femms \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
40 "movq %0, %%mm7 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
41 ::"m"(*(s->inverse ? m1p1 : p1m1))
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
42 );
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
43
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
44 i = 8 << ln;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
45 asm volatile(
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
46 "1: \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
47 "sub $32, %0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
48 "movq (%0,%1), %%mm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
49 "movq 16(%0,%1), %%mm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
50 "movq 8(%0,%1), %%mm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
51 "movq 24(%0,%1), %%mm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
52 "movq %%mm0, %%mm4 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
53 "movq %%mm1, %%mm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
54 "pfadd %%mm2, %%mm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
55 "pfadd %%mm3, %%mm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
56 "pfsub %%mm2, %%mm4 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
57 "pfsub %%mm3, %%mm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
58 "movq %%mm0, %%mm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
59 "pswapd %%mm5, %%mm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
60 "movq %%mm4, %%mm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
61 "pxor %%mm7, %%mm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
62 "pfadd %%mm1, %%mm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
63 "pfadd %%mm5, %%mm4 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
64 "pfsub %%mm1, %%mm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
65 "pfsub %%mm5, %%mm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
66 "movq %%mm0, (%0,%1) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
67 "movq %%mm4, 8(%0,%1) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
68 "movq %%mm2, 16(%0,%1) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
69 "movq %%mm3, 24(%0,%1) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
70 "jg 1b \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
71 :"+r"(i)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
72 :"r"(z)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
73 );
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
74 /* pass 2 .. ln-1 */
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
75
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
76 nblocks = 1 << (ln-3);
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
77 nloops = 1 << 2;
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
78 cptr = s->exptab1;
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
79 do {
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
80 p = z;
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
81 j = nblocks;
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
82 do {
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
83 i = nloops*8;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
84 asm volatile(
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
85 "1: \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
86 "sub $16, %0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
87 "movq (%1,%0), %%mm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
88 "movq 8(%1,%0), %%mm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
89 "movq (%2,%0), %%mm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
90 "movq 8(%2,%0), %%mm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
91 "movq (%3,%0,2), %%mm4 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
92 "movq 8(%3,%0,2), %%mm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
93 "pswapd %%mm4, %%mm6 \n\t" // no need for cptr[2] & cptr[3]
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
94 "pswapd %%mm5, %%mm7 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
95 "pfmul %%mm2, %%mm4 \n\t" // cre*re cim*im
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
96 "pfmul %%mm3, %%mm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
97 "pfmul %%mm2, %%mm6 \n\t" // cim*re cre*im
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
98 "pfmul %%mm3, %%mm7 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
99 "pfpnacc %%mm6, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
100 "pfpnacc %%mm7, %%mm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
101 "movq %%mm0, %%mm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
102 "movq %%mm1, %%mm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
103 "pfadd %%mm4, %%mm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
104 "pfadd %%mm5, %%mm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
105 "pfsub %%mm4, %%mm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
106 "pfsub %%mm5, %%mm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
107 "movq %%mm0, (%1,%0) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
108 "movq %%mm1, 8(%1,%0) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
109 "movq %%mm2, (%2,%0) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
110 "movq %%mm3, 8(%2,%0) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
111 "jg 1b \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
112 :"+r"(i)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
113 :"r"(p), "r"(p + nloops), "r"(cptr)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
114 );
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
115 p += nloops*2;
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
116 } while (--j);
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
117 cptr += nloops*2;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
118 nblocks >>= 1;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
119 nloops <<= 1;
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
120 } while (nblocks != 0);
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
121 asm volatile("femms");
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
122 }
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
123
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
124 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
125 const FFTSample *input, FFTSample *tmp)
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
126 {
3559
c02459cd0d31 slightly faster ff_imdct_calc_3dn2() on amd64. (gcc added a bunch of useless movsxd)
lorenm
parents: 3555
diff changeset
127 long k, n8, n4, n2, n;
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
128 const uint16_t *revtab = s->fft.revtab;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
129 const FFTSample *tcos = s->tcos;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
130 const FFTSample *tsin = s->tsin;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
131 const FFTSample *in1, *in2;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
132 FFTComplex *z = (FFTComplex *)tmp;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
133
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
134 n = 1 << s->nbits;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
135 n2 = n >> 1;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
136 n4 = n >> 2;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
137 n8 = n >> 3;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
138
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
139 /* pre rotation */
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
140 in1 = input;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
141 in2 = input + n2 - 1;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
142 for(k = 0; k < n4; k++) {
3560
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
143 // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
144 asm volatile(
3560
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
145 "movd %0, %%mm0 \n\t"
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
146 "movd %2, %%mm1 \n\t"
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
147 "punpckldq %1, %%mm0 \n\t"
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
148 "punpckldq %3, %%mm1 \n\t"
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
149 "movq %%mm0, %%mm2 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
150 "pfmul %%mm1, %%mm0 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
151 "pswapd %%mm1, %%mm1 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
152 "pfmul %%mm1, %%mm2 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
153 "pfpnacc %%mm2, %%mm0 \n\t"
3560
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
154 ::"m"(in2[-2*k]), "m"(in1[2*k]),
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
155 "m"(tcos[k]), "m"(tsin[k])
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
156 );
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
157 asm volatile(
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
158 "movq %%mm0, %0 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
159 :"=m"(z[revtab[k]])
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
160 );
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
161 }
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
162
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
163 ff_fft_calc(&s->fft, z);
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
164
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
165 /* post rotation + reordering */
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
166 for(k = 0; k < n4; k++) {
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
167 asm volatile(
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
168 "movq %0, %%mm0 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
169 "movd %1, %%mm1 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
170 "punpckldq %2, %%mm1 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
171 "movq %%mm0, %%mm2 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
172 "pfmul %%mm1, %%mm0 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
173 "pswapd %%mm1, %%mm1 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
174 "pfmul %%mm1, %%mm2 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
175 "pfpnacc %%mm2, %%mm0 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
176 "movq %%mm0, %0 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
177 :"+m"(z[k])
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
178 :"m"(tcos[k]), "m"(tsin[k])
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
179 );
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
180 }
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
181
3747
ba1f57431c85 tweak ff_imdct_calc_3dn2
lorenm
parents: 3590
diff changeset
182 k = n-8;
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
183 asm volatile("movd %0, %%mm7" ::"r"(1<<31));
3747
ba1f57431c85 tweak ff_imdct_calc_3dn2
lorenm
parents: 3590
diff changeset
184 asm volatile(
3748
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
185 "1: \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
186 "movq (%4,%0), %%mm0 \n\t" // z[n8+k]
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
187 "neg %0 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
188 "pswapd -8(%4,%0), %%mm1 \n\t" // z[n8-1-k]
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
189 "movq %%mm0, %%mm2 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
190 "pxor %%mm7, %%mm2 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
191 "punpckldq %%mm1, %%mm2 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
192 "pswapd %%mm2, %%mm3 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
193 "punpckhdq %%mm1, %%mm0 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
194 "pswapd %%mm0, %%mm4 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
195 "pxor %%mm7, %%mm0 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
196 "pxor %%mm7, %%mm4 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
197 "movq %%mm3, -8(%3,%0) \n\t" // output[n-2-2*k] = { z[n8-1-k].im, -z[n8+k].re }
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
198 "movq %%mm4, -8(%2,%0) \n\t" // output[n2-2-2*k]= { -z[n8-1-k].re, z[n8+k].im }
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
199 "neg %0 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
200 "movq %%mm0, (%1,%0) \n\t" // output[2*k] = { -z[n8+k].im, z[n8-1-k].re }
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
201 "movq %%mm2, (%2,%0) \n\t" // output[n2+2*k] = { -z[n8+k].re, z[n8-1-k].im }
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
202 "sub $8, %0 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
203 "jge 1b \n\t"
3747
ba1f57431c85 tweak ff_imdct_calc_3dn2
lorenm
parents: 3590
diff changeset
204 :"+r"(k)
ba1f57431c85 tweak ff_imdct_calc_3dn2
lorenm
parents: 3590
diff changeset
205 :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8)
ba1f57431c85 tweak ff_imdct_calc_3dn2
lorenm
parents: 3590
diff changeset
206 :"memory"
ba1f57431c85 tweak ff_imdct_calc_3dn2
lorenm
parents: 3590
diff changeset
207 );
3561
97325fecd35a emms -> femms
lorenm
parents: 3560
diff changeset
208 asm volatile("femms");
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
209 }
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
210