Mercurial > libavcodec.hg
annotate ppc/fft_altivec.c @ 12340:2d15f62f4f8a libavcodec
VP8: move zeroing of luma DC block into the WHT
Lets us do the zeroing in asm instead of C.
Also makes it consistent with the way the regular iDCT code does it.
author | darkshikari |
---|---|
date | Mon, 02 Aug 2010 20:18:09 +0000 |
parents | 5638941ec8ef |
children |
rev | line source |
---|---|
975
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
1 /* |
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
2 * FFT/IFFT transforms |
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
3 * AltiVec-enabled |
12046 | 4 * Copyright (c) 2009 Loren Merritt |
975
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
5 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
6 * This file is part of FFmpeg. |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
7 * |
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
8 * FFmpeg is free software; you can redistribute it and/or |
975
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
9 * modify it under the terms of the GNU Lesser General Public |
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
10 * License as published by the Free Software Foundation; either |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
11 * version 2.1 of the License, or (at your option) any later version. |
975
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
12 * |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
13 * FFmpeg is distributed in the hope that it will be useful, |
975
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
16 * Lesser General Public License for more details. |
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
17 * |
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
18 * You should have received a copy of the GNU Lesser General Public |
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
19 * License along with FFmpeg; if not, write to the Free Software |
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
975
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
21 */ |
11370 | 22 #include "libavcodec/fft.h" |
5750
09f99af1db40
Sanitize altivec code so it can be built with runtime check properly
lu_zero
parents:
5010
diff
changeset
|
23 #include "util_altivec.h" |
12046 | 24 #include "types_altivec.h" |
11382
50415a8f1451
PPC: move prototypes to headers and make some functions static
mru
parents:
11370
diff
changeset
|
25 |
975
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
26 /** |
1879
dd63cb7e5080
fft_*() renamed into ff_fft_*() patch by (Gildas Bazin <gbazin at altern dot org>)
michael
parents:
1352
diff
changeset
|
27 * Do a complex FFT with the parameters defined in ff_fft_init(). The |
975
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
28 * input data must be permuted before with s->revtab table. No |
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
29 * 1.0/sqrt(n) normalization is done. |
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
30 * AltiVec-enabled |
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
31 * This code assumes that the 'z' pointer is 16-byte aligned. |
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
32 * It also assumes that each FFTComplex is an 8-byte-aligned pair of floats. |
e05d525505c5
fft altivec by Romain Dolbeau - simplified patch, test it on PPC with fft-test and wma decoding
bellard
parents:
diff
changeset
|
33 */ |
12046 | 34 |
/* Prototypes of the AltiVec FFT entry points referenced further down in this
 * file. Their definitions are not part of this file.
 * NOTE(review): presumably they live in an assembly source, given that every
 * use below is guarded by HAVE_GNU_AS -- confirm. */
12089 | 35 void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z); |
36 void ff_fft_calc_interleave_altivec(FFTContext *s, FFTComplex *z); | |
12046 | 37 |
12049 | 38 #if HAVE_GNU_AS |
/**
 * AltiVec half inverse MDCT.
 *
 * With n = 1 << s->mdct_bits, the function works in three phases:
 *  1. Pre-rotation: input is read as vectors relative to input + n/4, walking
 *     outwards in both directions; each vector is multiplied by the twiddle
 *     factors taken relative to s->tcos + n/8 and s->tsin + n/8, and the
 *     resulting pairs are scattered into output at the positions given by
 *     s->revtab.
 *  2. An in-place call to ff_fft_calc_altivec() on output.
 *  3. Post-rotation and reordering, done in place on the vectors addressed
 *     relative to output + n/4, from both ends towards the middle.
 *
 * Both loops are do/while loops counting k from n/32 - 1 down to 0, so the
 * body always runs at least once.
 *
 * NOTE(review): the vector casts assume that input + n/4, output + n/4,
 * s->tcos + n/8 and s->tsin + n/8 are suitably aligned for vec_f, and that
 * vec_f holds four floats (in which case phase 3 covers output[0 .. n/2-1])
 * -- confirm against types_altivec.h and the callers.
 *
 * @param s      FFT context; mdct_bits, revtab, tcos and tsin are read
 * @param output destination buffer, also used as FFT work area
 * @param input  source coefficients (read only)
 */
39 static void ff_imdct_half_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) | |
40 { | |
41 int j, k; | |
42 int n = 1 << s->mdct_bits; | |
43 int n4 = n >> 2; | |
44 int n8 = n >> 3; | |
45 int n32 = n >> 5; | |
/* revtabj walks the permutation table upwards from its start, revtabk
 * walks it downwards from entry n4 (see the += 4 / -= 4 at the loop end). */
46 const uint16_t *revtabj = s->revtab; | |
47 const uint16_t *revtabk = s->revtab+n4; | |
/* All vector pointers below are centred so that both positive and negative
 * indices are used. */
48 const vec_f *tcos = (const vec_f*)(s->tcos+n8); | |
49 const vec_f *tsin = (const vec_f*)(s->tsin+n8); | |
50 const vec_f *pin = (const vec_f*)(input+n4); | |
51 vec_f *pout = (vec_f*)(output+n4); | |
52 | |
53 /* pre rotation */ | |
54 k = n32-1; | |
55 do { | |
56 vec_f cos,sin,cos0,sin0,cos1,sin1,re,im,r0,i0,r1,i1,a,b,c,d; | |
/* CMULA(p, ...): load one vector from each side of the centre, split them
 * into re/im lanes, pick the matching twiddles out of cos0/cos1 and
 * sin0/sin1, and leave the rotated result in r<p> / i<p>.
 * STORE2(v, dst): write two elements of v to output + 2*dst using two
 * vec_ste calls (byte offsets 0 and 4).
 * STORE8(p): interleave r<p>/i<p> and scatter the four pairs through
 * revtabk / revtabj.
 * These macros are not #undef'd and rely on the local variable names. */
57 #define CMULA(p,o0,o1,o2,o3)\ | |
58 a = pin[ k*2+p]; /* { z[k].re, z[k].im, z[k+1].re, z[k+1].im } */\ | |
59 b = pin[-k*2-p-1]; /* { z[-k-2].re, z[-k-2].im, z[-k-1].re, z[-k-1].im } */\ | |
60 re = vec_perm(a, b, vcprm(0,2,s0,s2)); /* { z[k].re, z[k+1].re, z[-k-2].re, z[-k-1].re } */\ | |
61 im = vec_perm(a, b, vcprm(s3,s1,3,1)); /* { z[-k-1].im, z[-k-2].im, z[k+1].im, z[k].im } */\ | |
62 cos = vec_perm(cos0, cos1, vcprm(o0,o1,s##o2,s##o3)); /* { cos[k], cos[k+1], cos[-k-2], cos[-k-1] } */\ | |
63 sin = vec_perm(sin0, sin1, vcprm(o0,o1,s##o2,s##o3));\ | |
64 r##p = im*cos - re*sin;\ | |
65 i##p = re*cos + im*sin; | |
66 #define STORE2(v,dst)\ | |
67 j = dst;\ | |
68 vec_ste(v, 0, output+j*2);\ | |
69 vec_ste(v, 4, output+j*2); | |
70 #define STORE8(p)\ | |
71 a = vec_perm(r##p, i##p, vcprm(0,s0,0,s0));\ | |
72 b = vec_perm(r##p, i##p, vcprm(1,s1,1,s1));\ | |
73 c = vec_perm(r##p, i##p, vcprm(2,s2,2,s2));\ | |
74 d = vec_perm(r##p, i##p, vcprm(3,s3,3,s3));\ | |
75 STORE2(a, revtabk[ p*2-4]);\ | |
76 STORE2(b, revtabk[ p*2-3]);\ | |
77 STORE2(c, revtabj[-p*2+2]);\ | |
78 STORE2(d, revtabj[-p*2+3]); | |
79 | |
80 cos0 = tcos[k]; | |
81 sin0 = tsin[k]; | |
82 cos1 = tcos[-k-1]; | |
83 sin1 = tsin[-k-1]; | |
84 CMULA(0, 0,1,2,3); | |
85 CMULA(1, 2,3,0,1); | |
86 STORE8(0); | |
87 STORE8(1); | |
88 revtabj += 4; | |
89 revtabk -= 4; | |
90 k--; | |
91 } while(k >= 0); | |
92 | |
/* In-place FFT over the data scattered into output by the loop above. */
12089 | 93 ff_fft_calc_altivec(s, (FFTComplex*)output); |
12049 | 94 |
95 /* post rotation + reordering */ | |
/* j starts at the lowest vector pair and moves up, k starts at the highest
 * and moves down; each iteration rotates both pairs and writes them back
 * with their results exchanged (a with d, c with b). */
96 j = -n32; | |
97 k = n32-1; | |
98 do { | |
99 vec_f cos,sin,re,im,a,b,c,d; | |
100 #define CMULB(d0,d1,o)\ | |
101 re = pout[o*2];\ | |
102 im = pout[o*2+1];\ | |
103 cos = tcos[o];\ | |
104 sin = tsin[o];\ | |
105 d0 = im*sin - re*cos;\ | |
106 d1 = re*sin + im*cos; | |
107 | |
108 CMULB(a,b,j); | |
109 CMULB(c,d,k); | |
110 pout[2*j] = vec_perm(a, d, vcprm(0,s3,1,s2)); | |
111 pout[2*j+1] = vec_perm(a, d, vcprm(2,s1,3,s0)); | |
112 pout[2*k] = vec_perm(c, b, vcprm(0,s3,1,s2)); | |
113 pout[2*k+1] = vec_perm(c, b, vcprm(2,s1,3,s0)); | |
114 j++; | |
115 k--; | |
116 } while(k >= 0); | |
117 } | |
118 | |
119 static void ff_imdct_calc_altivec(FFTContext *s, FFTSample *output, const FFTSample *input) | |
120 { | |
121 int k; | |
122 int n = 1 << s->mdct_bits; | |
123 int n4 = n >> 2; | |
124 int n16 = n >> 4; | |
125 vec_u32 sign = {1<<31,1<<31,1<<31,1<<31}; | |
126 vec_u32 *p0 = (vec_u32*)(output+n4); | |
127 vec_u32 *p1 = (vec_u32*)(output+n4*3); | |
128 | |
129 ff_imdct_half_altivec(s, output+n4, input); | |
130 | |
131 for (k = 0; k < n16; k++) { | |
132 vec_u32 a = p0[k] ^ sign; | |
133 vec_u32 b = p1[-k-1]; | |
134 p0[-k-1] = vec_perm(a, a, vcprm(3,2,1,0)); | |
135 p1[k] = vec_perm(b, b, vcprm(3,2,1,0)); | |
136 } | |
137 } | |
138 #endif /* HAVE_GNU_AS */ | |
139 | |
10175
5cf49858179a
Move per-arch fft init bits into the corresponding subdirs
mru
parents:
9364
diff
changeset
|
140 av_cold void ff_fft_init_altivec(FFTContext *s) |
5cf49858179a
Move per-arch fft init bits into the corresponding subdirs
mru
parents:
9364
diff
changeset
|
141 { |
12052 | 142 #if HAVE_GNU_AS |
12089 | 143 s->fft_calc = ff_fft_calc_interleave_altivec; |
12052 | 144 s->imdct_calc = ff_imdct_calc_altivec; |
145 s->imdct_half = ff_imdct_half_altivec; | |
146 #endif | |
10175
5cf49858179a
Move per-arch fft init bits into the corresponding subdirs
mru
parents:
9364
diff
changeset
|
147 } |