annotate i386/fft_3dn2.c @ 6920:d02af7474bff libavcodec

Prevent 128*1<<trellis from becoming 0 and creating 0 sized arrays. fixes CID84 RUN2 CID85 RUN2 CID86 RUN2 CID87 RUN2 CID88 RUN2 CID89 RUN2 CID90 RUN2 CID91 RUN2 CID92 RUN2 CID93 RUN2 CID94 RUN2 CID95 RUN2 CID96 RUN2 CID97 RUN2 CID98 RUN2 CID99 RUN2 CID100 RUN2 CID101 RUN2 CID102 RUN2 CID103 RUN2 CID104 RUN2 CID105 RUN2 CID106 RUN2
author michael
date Wed, 28 May 2008 11:59:41 +0000
parents f7cbb7733146
children fc843d00867c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
1 /*
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
2 * FFT/MDCT transform with Extended 3DNow! optimizations
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
3 * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
4 * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard.
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
5 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3748
diff changeset
6 * This file is part of FFmpeg.
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3748
diff changeset
7 *
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3748
diff changeset
8 * FFmpeg is free software; you can redistribute it and/or
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
9 * modify it under the terms of the GNU Lesser General Public
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
10 * License as published by the Free Software Foundation; either
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3748
diff changeset
11 * version 2.1 of the License, or (at your option) any later version.
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
12 *
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3748
diff changeset
13 * FFmpeg is distributed in the hope that it will be useful,
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
16 * Lesser General Public License for more details.
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
17 *
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
18 * You should have received a copy of the GNU Lesser General Public
3947
c8c591fe26f8 Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents: 3748
diff changeset
19 * License along with FFmpeg; if not, write to the Free Software
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
21 */
6763
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6755
diff changeset
22
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6755
diff changeset
23 #include "libavutil/x86_cpu.h"
f7cbb7733146 Use full path for #includes from another directory.
diego
parents: 6755
diff changeset
24 #include "libavcodec/dsputil.h"
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
25
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
/*
 * 64-bit XOR mask with only the sign bit of the upper 32-bit lane set
 * ({ +, - }).  ff_fft_calc_3dn2() loads it into mm7 when s->inverse is zero.
 * Built from an unsigned shift because 1 << 31 on a signed int is undefined
 * behaviour; the resulting bit pattern is 0x80000000 as before.
 */
static const int p1m1[2] __attribute__((aligned(8))) =
    { 0, (int)(1U << 31) };
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
28
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
/*
 * 64-bit XOR mask with only the sign bit of the lower 32-bit lane set
 * ({ -, + }).  ff_fft_calc_3dn2() loads it into mm7 when s->inverse is
 * non-zero.  Built from an unsigned shift because 1 << 31 on a signed int is
 * undefined behaviour; the resulting bit pattern is 0x80000000 as before.
 */
static const int m1p1[2] __attribute__((aligned(8))) =
    { (int)(1U << 31), 0 };
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
31
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
32 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
33 {
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
34 int ln = s->nbits;
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 5010
diff changeset
35 long j;
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 5010
diff changeset
36 x86_reg i;
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
37 long nblocks, nloops;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
38 FFTComplex *p, *cptr;
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
39
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
40 asm volatile(
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
41 /* FEMMS is not a must here but recommended by AMD */
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
42 "femms \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
43 "movq %0, %%mm7 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
44 ::"m"(*(s->inverse ? m1p1 : p1m1))
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
45 );
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
46
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
47 i = 8 << ln;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
48 asm volatile(
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
49 "1: \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
50 "sub $32, %0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
51 "movq (%0,%1), %%mm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
52 "movq 16(%0,%1), %%mm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
53 "movq 8(%0,%1), %%mm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
54 "movq 24(%0,%1), %%mm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
55 "movq %%mm0, %%mm4 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
56 "movq %%mm1, %%mm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
57 "pfadd %%mm2, %%mm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
58 "pfadd %%mm3, %%mm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
59 "pfsub %%mm2, %%mm4 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
60 "pfsub %%mm3, %%mm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
61 "movq %%mm0, %%mm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
62 "pswapd %%mm5, %%mm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
63 "movq %%mm4, %%mm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
64 "pxor %%mm7, %%mm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
65 "pfadd %%mm1, %%mm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
66 "pfadd %%mm5, %%mm4 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
67 "pfsub %%mm1, %%mm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
68 "pfsub %%mm5, %%mm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
69 "movq %%mm0, (%0,%1) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
70 "movq %%mm4, 8(%0,%1) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
71 "movq %%mm2, 16(%0,%1) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
72 "movq %%mm3, 24(%0,%1) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
73 "jg 1b \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
74 :"+r"(i)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
75 :"r"(z)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
76 );
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
77 /* pass 2 .. ln-1 */
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
78
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
79 nblocks = 1 << (ln-3);
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
80 nloops = 1 << 2;
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
81 cptr = s->exptab1;
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
82 do {
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
83 p = z;
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
84 j = nblocks;
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
85 do {
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
86 i = nloops*8;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
87 asm volatile(
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
88 "1: \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
89 "sub $16, %0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
90 "movq (%1,%0), %%mm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
91 "movq 8(%1,%0), %%mm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
92 "movq (%2,%0), %%mm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
93 "movq 8(%2,%0), %%mm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
94 "movq (%3,%0,2), %%mm4 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
95 "movq 8(%3,%0,2), %%mm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
96 "pswapd %%mm4, %%mm6 \n\t" // no need for cptr[2] & cptr[3]
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
97 "pswapd %%mm5, %%mm7 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
98 "pfmul %%mm2, %%mm4 \n\t" // cre*re cim*im
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
99 "pfmul %%mm3, %%mm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
100 "pfmul %%mm2, %%mm6 \n\t" // cim*re cre*im
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
101 "pfmul %%mm3, %%mm7 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
102 "pfpnacc %%mm6, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
103 "pfpnacc %%mm7, %%mm5 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
104 "movq %%mm0, %%mm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
105 "movq %%mm1, %%mm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
106 "pfadd %%mm4, %%mm0 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
107 "pfadd %%mm5, %%mm1 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
108 "pfsub %%mm4, %%mm2 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
109 "pfsub %%mm5, %%mm3 \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
110 "movq %%mm0, (%1,%0) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
111 "movq %%mm1, 8(%1,%0) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
112 "movq %%mm2, (%2,%0) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
113 "movq %%mm3, 8(%2,%0) \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
114 "jg 1b \n\t"
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
115 :"+r"(i)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
116 :"r"(p), "r"(p + nloops), "r"(cptr)
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
117 );
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
118 p += nloops*2;
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
119 } while (--j);
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
120 cptr += nloops*2;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
121 nblocks >>= 1;
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
122 nloops <<= 1;
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
123 } while (nblocks != 0);
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
124 asm volatile("femms");
3175
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
125 }
c20c181e0eca 3DNow! & Extended 3DNow! versions of FFT
corey
parents:
diff changeset
126
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
127 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
128 const FFTSample *input, FFTSample *tmp)
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
129 {
6755
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 5010
diff changeset
130 long n8, n4, n2, n;
33896780c612 Do not misuse long as the size of a register in x86.
ramiro
parents: 5010
diff changeset
131 x86_reg k;
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
132 const uint16_t *revtab = s->fft.revtab;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
133 const FFTSample *tcos = s->tcos;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
134 const FFTSample *tsin = s->tsin;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
135 const FFTSample *in1, *in2;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
136 FFTComplex *z = (FFTComplex *)tmp;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
137
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
138 n = 1 << s->nbits;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
139 n2 = n >> 1;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
140 n4 = n >> 2;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
141 n8 = n >> 3;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
142
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
143 /* pre rotation */
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
144 in1 = input;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
145 in2 = input + n2 - 1;
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
146 for(k = 0; k < n4; k++) {
3560
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
147 // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
148 asm volatile(
3560
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
149 "movd %0, %%mm0 \n\t"
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
150 "movd %2, %%mm1 \n\t"
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
151 "punpckldq %1, %%mm0 \n\t"
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
152 "punpckldq %3, %%mm1 \n\t"
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
153 "movq %%mm0, %%mm2 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
154 "pfmul %%mm1, %%mm0 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
155 "pswapd %%mm1, %%mm1 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
156 "pfmul %%mm1, %%mm2 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
157 "pfpnacc %%mm2, %%mm0 \n\t"
3560
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
158 ::"m"(in2[-2*k]), "m"(in1[2*k]),
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
159 "m"(tcos[k]), "m"(tsin[k])
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
160 );
f1a16d793fc5 gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
lorenm
parents: 3559
diff changeset
161 asm volatile(
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
162 "movq %%mm0, %0 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
163 :"=m"(z[revtab[k]])
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
164 );
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
165 }
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
166
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
167 ff_fft_calc(&s->fft, z);
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
168
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
169 /* post rotation + reordering */
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
170 for(k = 0; k < n4; k++) {
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
171 asm volatile(
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
172 "movq %0, %%mm0 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
173 "movd %1, %%mm1 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
174 "punpckldq %2, %%mm1 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
175 "movq %%mm0, %%mm2 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
176 "pfmul %%mm1, %%mm0 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
177 "pswapd %%mm1, %%mm1 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
178 "pfmul %%mm1, %%mm2 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
179 "pfpnacc %%mm2, %%mm0 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
180 "movq %%mm0, %0 \n\t"
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
181 :"+m"(z[k])
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
182 :"m"(tcos[k]), "m"(tsin[k])
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
183 );
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
184 }
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
185
3747
ba1f57431c85 tweak ff_imdct_calc_3dn2
lorenm
parents: 3590
diff changeset
186 k = n-8;
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
187 asm volatile("movd %0, %%mm7" ::"r"(1<<31));
3747
ba1f57431c85 tweak ff_imdct_calc_3dn2
lorenm
parents: 3590
diff changeset
188 asm volatile(
3748
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
189 "1: \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
190 "movq (%4,%0), %%mm0 \n\t" // z[n8+k]
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
191 "neg %0 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
192 "pswapd -8(%4,%0), %%mm1 \n\t" // z[n8-1-k]
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
193 "movq %%mm0, %%mm2 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
194 "pxor %%mm7, %%mm2 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
195 "punpckldq %%mm1, %%mm2 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
196 "pswapd %%mm2, %%mm3 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
197 "punpckhdq %%mm1, %%mm0 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
198 "pswapd %%mm0, %%mm4 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
199 "pxor %%mm7, %%mm0 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
200 "pxor %%mm7, %%mm4 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
201 "movq %%mm3, -8(%3,%0) \n\t" // output[n-2-2*k] = { z[n8-1-k].im, -z[n8+k].re }
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
202 "movq %%mm4, -8(%2,%0) \n\t" // output[n2-2-2*k]= { -z[n8-1-k].re, z[n8+k].im }
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
203 "neg %0 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
204 "movq %%mm0, (%1,%0) \n\t" // output[2*k] = { -z[n8+k].im, z[n8-1-k].re }
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
205 "movq %%mm2, (%2,%0) \n\t" // output[n2+2*k] = { -z[n8+k].re, z[n8-1-k].im }
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
206 "sub $8, %0 \n\t"
82d22b7e80ff cosmetics (indentation)
lorenm
parents: 3747
diff changeset
207 "jge 1b \n\t"
3747
ba1f57431c85 tweak ff_imdct_calc_3dn2
lorenm
parents: 3590
diff changeset
208 :"+r"(k)
ba1f57431c85 tweak ff_imdct_calc_3dn2
lorenm
parents: 3590
diff changeset
209 :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8)
ba1f57431c85 tweak ff_imdct_calc_3dn2
lorenm
parents: 3590
diff changeset
210 :"memory"
ba1f57431c85 tweak ff_imdct_calc_3dn2
lorenm
parents: 3590
diff changeset
211 );
3561
97325fecd35a emms -> femms
lorenm
parents: 3560
diff changeset
212 asm volatile("femms");
3555
5ea82888103e 3dnow2 implementation of imdct.
lorenm
parents: 3175
diff changeset
213 }
3590
a3d97c60ea07 ff_fft_calc_3dn/3dn2/sse: convert intrinsics to inline asm.
lorenm
parents: 3561
diff changeset
214