Mercurial > libavcodec.hg
comparison ppc/fft_altivec.c @ 1009:3b7cc8e4b83f libavcodec
AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
author | michaelni |
---|---|
date | Thu, 16 Jan 2003 21:54:55 +0000 |
parents | edc10966b081 |
children | 35cf2f4a0f8c |
comparison
equal
deleted
inserted
replaced
1008:fb6cbb8a04a3 | 1009:3b7cc8e4b83f |
---|---|
20 */ | 20 */ |
21 #include "../dsputil.h" | 21 #include "../dsputil.h" |
22 | 22 |
23 #include "dsputil_altivec.h" | 23 #include "dsputil_altivec.h" |
24 | 24 |
25 /* | |
26 those three macros are from libavcodec/fft.c | |
27 and are required for the reference C code | |
28 */ | |
29 /* butterfly op */ | |
30 #define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ | |
31 {\ | |
32 FFTSample ax, ay, bx, by;\ | |
33 bx=pre1;\ | |
34 by=pim1;\ | |
35 ax=qre1;\ | |
36 ay=qim1;\ | |
37 pre = (bx + ax);\ | |
38 pim = (by + ay);\ | |
39 qre = (bx - ax);\ | |
40 qim = (by - ay);\ | |
41 } | |
42 #define MUL16(a,b) ((a) * (b)) | |
43 #define CMUL(pre, pim, are, aim, bre, bim) \ | |
44 {\ | |
45 pre = (MUL16(are, bre) - MUL16(aim, bim));\ | |
46 pim = (MUL16(are, bim) + MUL16(bre, aim));\ | |
47 } | |
48 | |
49 | |
25 /** | 50 /** |
26 * Do a complex FFT with the parameters defined in fft_init(). The | 51 * Do a complex FFT with the parameters defined in fft_init(). The |
27 * input data must be permuted before with s->revtab table. No | 52 * input data must be permuted before with s->revtab table. No |
28 * 1.0/sqrt(n) normalization is done. | 53 * 1.0/sqrt(n) normalization is done. |
29 * AltiVec-enabled | 54 * AltiVec-enabled |
33 * that successive MUL + ADD/SUB have been merged into | 58 * that successive MUL + ADD/SUB have been merged into |
34 * fused multiply-add ('vec_madd' in altivec) | 59 * fused multiply-add ('vec_madd' in altivec) |
35 */ | 60 */ |
36 void fft_calc_altivec(FFTContext *s, FFTComplex *z) | 61 void fft_calc_altivec(FFTContext *s, FFTComplex *z) |
37 { | 62 { |
63 ALTIVEC_TBL_DECLARE(altivec_fft_num, s->nbits >= 6); | |
64 #ifdef ALTIVEC_USE_REFERENCE_C_CODE | |
65 int ln = s->nbits; | |
66 int j, np, np2; | |
67 int nblocks, nloops; | |
68 register FFTComplex *p, *q; | |
69 FFTComplex *exptab = s->exptab; | |
70 int l; | |
71 FFTSample tmp_re, tmp_im; | |
72 | |
73 ALTIVEC_TBL_START_COUNT(altivec_fft_num, s->nbits >= 6); | |
74 | |
75 np = 1 << ln; | |
76 | |
77 /* pass 0 */ | |
78 | |
79 p=&z[0]; | |
80 j=(np >> 1); | |
81 do { | |
82 BF(p[0].re, p[0].im, p[1].re, p[1].im, | |
83 p[0].re, p[0].im, p[1].re, p[1].im); | |
84 p+=2; | |
85 } while (--j != 0); | |
86 | |
87 /* pass 1 */ | |
88 | |
89 | |
90 p=&z[0]; | |
91 j=np >> 2; | |
92 if (s->inverse) { | |
93 do { | |
94 BF(p[0].re, p[0].im, p[2].re, p[2].im, | |
95 p[0].re, p[0].im, p[2].re, p[2].im); | |
96 BF(p[1].re, p[1].im, p[3].re, p[3].im, | |
97 p[1].re, p[1].im, -p[3].im, p[3].re); | |
98 p+=4; | |
99 } while (--j != 0); | |
100 } else { | |
101 do { | |
102 BF(p[0].re, p[0].im, p[2].re, p[2].im, | |
103 p[0].re, p[0].im, p[2].re, p[2].im); | |
104 BF(p[1].re, p[1].im, p[3].re, p[3].im, | |
105 p[1].re, p[1].im, p[3].im, -p[3].re); | |
106 p+=4; | |
107 } while (--j != 0); | |
108 } | |
109 /* pass 2 .. ln-1 */ | |
110 | |
111 nblocks = np >> 3; | |
112 nloops = 1 << 2; | |
113 np2 = np >> 1; | |
114 do { | |
115 p = z; | |
116 q = z + nloops; | |
117 for (j = 0; j < nblocks; ++j) { | |
118 BF(p->re, p->im, q->re, q->im, | |
119 p->re, p->im, q->re, q->im); | |
120 | |
121 p++; | |
122 q++; | |
123 for(l = nblocks; l < np2; l += nblocks) { | |
124 CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im); | |
125 BF(p->re, p->im, q->re, q->im, | |
126 p->re, p->im, tmp_re, tmp_im); | |
127 p++; | |
128 q++; | |
129 } | |
130 | |
131 p += nloops; | |
132 q += nloops; | |
133 } | |
134 nblocks = nblocks >> 1; | |
135 nloops = nloops << 1; | |
136 } while (nblocks != 0); | |
137 | |
138 ALTIVEC_TBL_STOP_COUNT(altivec_fft_num, s->nbits >= 6); | |
139 | |
140 #else /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
38 register const vector float vczero = (const vector float)(0.); | 141 register const vector float vczero = (const vector float)(0.); |
39 | 142 |
40 int ln = s->nbits; | 143 int ln = s->nbits; |
41 int j, np, np2; | 144 int j, np, np2; |
42 int nblocks, nloops; | 145 int nblocks, nloops; |
43 register FFTComplex *p, *q; | 146 register FFTComplex *p, *q; |
44 FFTComplex *cptr, *cptr1; | 147 FFTComplex *cptr, *cptr1; |
45 int k; | 148 int k; |
149 | |
150 ALTIVEC_TBL_START_COUNT(altivec_fft_num, s->nbits >= 6); | |
46 | 151 |
47 np = 1 << ln; | 152 np = 1 << ln; |
48 | 153 |
49 { | 154 { |
50 vector float *r, a, b, a1, c1, c2; | 155 vector float *r, a, b, a1, c1, c2; |
127 } while (--j); | 232 } while (--j); |
128 cptr1 += nloops * 2; | 233 cptr1 += nloops * 2; |
129 nblocks = nblocks >> 1; | 234 nblocks = nblocks >> 1; |
130 nloops = nloops << 1; | 235 nloops = nloops << 1; |
131 } while (nblocks != 0); | 236 } while (nblocks != 0); |
237 | |
238 ALTIVEC_TBL_STOP_COUNT(altivec_fft_num, s->nbits >= 6); | |
239 | |
240 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ | |
132 } | 241 } |
133 |