comparison ppc/fft_altivec.c @ 1009:3b7cc8e4b83f libavcodec

AltiVec perf (take 2), plus a couple AltiVec functions by (Romain Dolbeau <dolbeau at irisa dot fr>)
author michaelni
date Thu, 16 Jan 2003 21:54:55 +0000
parents edc10966b081
children 35cf2f4a0f8c
comparison
equal deleted inserted replaced
1008:fb6cbb8a04a3 1009:3b7cc8e4b83f
20 */ 20 */
21 #include "../dsputil.h" 21 #include "../dsputil.h"
22 22
23 #include "dsputil_altivec.h" 23 #include "dsputil_altivec.h"
24 24
25 /*
26 The following three macros are taken from libavcodec/fft.c
27 and are required by the reference C code.
28 */
29 /* butterfly op */
30 #define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
31 {\
32 FFTSample ax, ay, bx, by;\
33 bx=pre1;\
34 by=pim1;\
35 ax=qre1;\
36 ay=qim1;\
37 pre = (bx + ax);\
38 pim = (by + ay);\
39 qre = (bx - ax);\
40 qim = (by - ay);\
41 }
42 #define MUL16(a,b) ((a) * (b))
43 #define CMUL(pre, pim, are, aim, bre, bim) \
44 {\
45 pre = (MUL16(are, bre) - MUL16(aim, bim));\
46 pim = (MUL16(are, bim) + MUL16(bre, aim));\
47 }
48
49
25 /** 50 /**
26 * Do a complex FFT with the parameters defined in fft_init(). The 51 * Do a complex FFT with the parameters defined in fft_init(). The
27 * input data must be permuted before with s->revtab table. No 52 * input data must be permuted before with s->revtab table. No
28 * 1.0/sqrt(n) normalization is done. 53 * 1.0/sqrt(n) normalization is done.
29 * AltiVec-enabled 54 * AltiVec-enabled
33 * that successive MUL + ADD/SUB have been merged into 58 * that successive MUL + ADD/SUB have been merged into
34 * fused multiply-add ('vec_madd' in altivec) 59 * fused multiply-add ('vec_madd' in altivec)
35 */ 60 */
36 void fft_calc_altivec(FFTContext *s, FFTComplex *z) 61 void fft_calc_altivec(FFTContext *s, FFTComplex *z)
37 { 62 {
63 ALTIVEC_TBL_DECLARE(altivec_fft_num, s->nbits >= 6);
64 #ifdef ALTIVEC_USE_REFERENCE_C_CODE
65 int ln = s->nbits;
66 int j, np, np2;
67 int nblocks, nloops;
68 register FFTComplex *p, *q;
69 FFTComplex *exptab = s->exptab;
70 int l;
71 FFTSample tmp_re, tmp_im;
72
73 ALTIVEC_TBL_START_COUNT(altivec_fft_num, s->nbits >= 6);
74
75 np = 1 << ln;
76
77 /* pass 0 */
78
79 p=&z[0];
80 j=(np >> 1);
81 do {
82 BF(p[0].re, p[0].im, p[1].re, p[1].im,
83 p[0].re, p[0].im, p[1].re, p[1].im);
84 p+=2;
85 } while (--j != 0);
86
87 /* pass 1 */
88
89
90 p=&z[0];
91 j=np >> 2;
92 if (s->inverse) {
93 do {
94 BF(p[0].re, p[0].im, p[2].re, p[2].im,
95 p[0].re, p[0].im, p[2].re, p[2].im);
96 BF(p[1].re, p[1].im, p[3].re, p[3].im,
97 p[1].re, p[1].im, -p[3].im, p[3].re);
98 p+=4;
99 } while (--j != 0);
100 } else {
101 do {
102 BF(p[0].re, p[0].im, p[2].re, p[2].im,
103 p[0].re, p[0].im, p[2].re, p[2].im);
104 BF(p[1].re, p[1].im, p[3].re, p[3].im,
105 p[1].re, p[1].im, p[3].im, -p[3].re);
106 p+=4;
107 } while (--j != 0);
108 }
109 /* pass 2 .. ln-1 */
110
111 nblocks = np >> 3;
112 nloops = 1 << 2;
113 np2 = np >> 1;
114 do {
115 p = z;
116 q = z + nloops;
117 for (j = 0; j < nblocks; ++j) {
118 BF(p->re, p->im, q->re, q->im,
119 p->re, p->im, q->re, q->im);
120
121 p++;
122 q++;
123 for(l = nblocks; l < np2; l += nblocks) {
124 CMUL(tmp_re, tmp_im, exptab[l].re, exptab[l].im, q->re, q->im);
125 BF(p->re, p->im, q->re, q->im,
126 p->re, p->im, tmp_re, tmp_im);
127 p++;
128 q++;
129 }
130
131 p += nloops;
132 q += nloops;
133 }
134 nblocks = nblocks >> 1;
135 nloops = nloops << 1;
136 } while (nblocks != 0);
137
138 ALTIVEC_TBL_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
139
140 #else /* ALTIVEC_USE_REFERENCE_C_CODE */
38 register const vector float vczero = (const vector float)(0.); 141 register const vector float vczero = (const vector float)(0.);
39 142
40 int ln = s->nbits; 143 int ln = s->nbits;
41 int j, np, np2; 144 int j, np, np2;
42 int nblocks, nloops; 145 int nblocks, nloops;
43 register FFTComplex *p, *q; 146 register FFTComplex *p, *q;
44 FFTComplex *cptr, *cptr1; 147 FFTComplex *cptr, *cptr1;
45 int k; 148 int k;
149
150 ALTIVEC_TBL_START_COUNT(altivec_fft_num, s->nbits >= 6);
46 151
47 np = 1 << ln; 152 np = 1 << ln;
48 153
49 { 154 {
50 vector float *r, a, b, a1, c1, c2; 155 vector float *r, a, b, a1, c1, c2;
127 } while (--j); 232 } while (--j);
128 cptr1 += nloops * 2; 233 cptr1 += nloops * 2;
129 nblocks = nblocks >> 1; 234 nblocks = nblocks >> 1;
130 nloops = nloops << 1; 235 nloops = nloops << 1;
131 } while (nblocks != 0); 236 } while (nblocks != 0);
237
238 ALTIVEC_TBL_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
239
240 #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
132 } 241 }
133