Mercurial > libavcodec.hg
annotate faandct.c @ 2892:41315d0120b3 libavcodec
replace a few mov + psrlq with pshufw, there are more cases which could benefit from this but they would require us to duplicate some functions ...
the trick is from various places (my own code in libpostproc, a patch on the x264 list, ...)
author | michael |
---|---|
date | Wed, 21 Sep 2005 21:17:09 +0000 |
parents | f65d87bfdd5a |
children | ef2149182f1c |
rev | line source |
---|---|
1557 | 1 /* |
2 * Floating point AAN DCT | |
3 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> | |
4 * | |
5 * This library is free software; you can redistribute it and/or | |
6 * modify it under the terms of the GNU Lesser General Public | |
7 * License as published by the Free Software Foundation; either | |
8 * version 2 of the License, or (at your option) any later version. | |
9 * | |
10 * This library is distributed in the hope that it will be useful, | |
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 * Lesser General Public License for more details. | |
14 * | |
15 * You should have received a copy of the GNU Lesser General Public | |
16 * License along with this library; if not, write to the Free Software | |
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
18 * | |
1558 | 19 * this implementation is based upon the IJG integer AAN DCT (see jfdctfst.c) |
1557 | 20 */ |
21 | |
22 /** | |
23 * @file faandct.c | |
24 * @brief | |
25 * Floating point AAN DCT | |
26 * @author Michael Niedermayer <michaelni@gmx.at> | |
27 */ | |
28 | |
29 #include "dsputil.h" | |
30 #include "faandct.h" | |
31 | |
/* Floating point type used for all intermediate DCT arithmetic. */
#define FLOAT float

/*
 * SCALE(x) is the factor applied to output coefficient x (0..63) in the
 * column pass. With FAAN_POSTSCALE defined the factor is read from
 * postscale[]; otherwise it is 1 and the outputs are left unscaled.
 * NOTE(review): presumably the caller then folds postscale[] into its
 * quantization table -- confirm against the quantizer setup code.
 */
#ifdef FAAN_POSTSCALE
#    define SCALE(x) postscale[x]
#else
#    define SCALE(x) 1
#endif

//numbers generated by simple c code (not as accurate as they could be)
/*
for(i=0; i<8; i++){
    printf("#define B%d %1.20Lf\n", i, (long double)1.0/(cosl(i*acosl(-1.0)/(long double)16.0)*sqrtl(2)));
}
*/
/*
 * NOTE(review): for i=0 the snippet above would print 0.7071..., not 1.0,
 * so B0 appears to have been set by hand -- confirm before regenerating.
 */
#define B0 1.00000000000000000000
#define B1 0.72095982200694791383 // (cos(pi*1/16)sqrt(2))^-1
#define B2 0.76536686473017954350 // (cos(pi*2/16)sqrt(2))^-1
#define B3 0.85043009476725644878 // (cos(pi*3/16)sqrt(2))^-1
#define B4 1.00000000000000000000 // (cos(pi*4/16)sqrt(2))^-1
#define B5 1.27275858057283393842 // (cos(pi*5/16)sqrt(2))^-1
#define B6 1.84775906502257351242 // (cos(pi*6/16)sqrt(2))^-1
#define B7 3.62450978541155137218 // (cos(pi*7/16)sqrt(2))^-1


/* Rotation constants used inside the butterflies. */
#define A1 0.70710678118654752438 // cos(pi*4/16)
#define A2 0.54119610014619698435 // cos(pi*6/16)sqrt(2)
#define A5 0.38268343236508977170 // cos(pi*6/16)
#define A4 1.30656296487637652774 // cos(pi*2/16)sqrt(2)

/*
 * Per-coefficient scale factors: entry [8*r + c] is Br*Bc.
 * The table is read-only, hence const.
 */
static const FLOAT postscale[64]={
B0*B0, B0*B1, B0*B2, B0*B3, B0*B4, B0*B5, B0*B6, B0*B7,
B1*B0, B1*B1, B1*B2, B1*B3, B1*B4, B1*B5, B1*B6, B1*B7,
B2*B0, B2*B1, B2*B2, B2*B3, B2*B4, B2*B5, B2*B6, B2*B7,
B3*B0, B3*B1, B3*B2, B3*B3, B3*B4, B3*B5, B3*B6, B3*B7,
B4*B0, B4*B1, B4*B2, B4*B3, B4*B4, B4*B5, B4*B6, B4*B7,
B5*B0, B5*B1, B5*B2, B5*B3, B5*B4, B5*B5, B5*B6, B5*B7,
B6*B0, B6*B1, B6*B2, B6*B3, B6*B4, B6*B5, B6*B6, B6*B7,
B7*B0, B7*B1, B7*B2, B7*B3, B7*B4, B7*B5, B7*B6, B7*B7,
};
70 | |
1589 | 71 static always_inline void row_fdct(FLOAT temp[64], DCTELEM * data) |
1557 | 72 { |
73 FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | |
74 FLOAT tmp10, tmp11, tmp12, tmp13; | |
75 FLOAT z1, z2, z3, z4, z5, z11, z13; | |
76 int i; | |
77 | |
78 for (i=0; i<8*8; i+=8) { | |
79 tmp0= data[0 + i] + data[7 + i]; | |
80 tmp7= data[0 + i] - data[7 + i]; | |
81 tmp1= data[1 + i] + data[6 + i]; | |
82 tmp6= data[1 + i] - data[6 + i]; | |
83 tmp2= data[2 + i] + data[5 + i]; | |
84 tmp5= data[2 + i] - data[5 + i]; | |
85 tmp3= data[3 + i] + data[4 + i]; | |
86 tmp4= data[3 + i] - data[4 + i]; | |
87 | |
88 tmp10= tmp0 + tmp3; | |
89 tmp13= tmp0 - tmp3; | |
90 tmp11= tmp1 + tmp2; | |
91 tmp12= tmp1 - tmp2; | |
92 | |
93 temp[0 + i]= tmp10 + tmp11; | |
94 temp[4 + i]= tmp10 - tmp11; | |
95 | |
96 z1= (tmp12 + tmp13)*A1; | |
97 temp[2 + i]= tmp13 + z1; | |
98 temp[6 + i]= tmp13 - z1; | |
99 | |
100 tmp10= tmp4 + tmp5; | |
101 tmp11= tmp5 + tmp6; | |
102 tmp12= tmp6 + tmp7; | |
103 | |
104 z5= (tmp10 - tmp12) * A5; | |
105 z2= tmp10*A2 + z5; | |
106 z4= tmp12*A4 + z5; | |
107 z3= tmp11*A1; | |
108 | |
109 z11= tmp7 + z3; | |
110 z13= tmp7 - z3; | |
111 | |
112 temp[5 + i]= z13 + z2; | |
113 temp[3 + i]= z13 - z2; | |
114 temp[1 + i]= z11 + z4; | |
115 temp[7 + i]= z11 - z4; | |
1589 | 116 } |
117 } | |
118 | |
119 void ff_faandct(DCTELEM * data) | |
120 { | |
121 FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; | |
122 FLOAT tmp10, tmp11, tmp12, tmp13; | |
123 FLOAT z1, z2, z3, z4, z5, z11, z13; | |
124 FLOAT temp[64]; | |
125 int i; | |
126 | |
127 emms_c(); | |
128 | |
129 row_fdct(temp, data); | |
1557 | 130 |
131 for (i=0; i<8; i++) { | |
132 tmp0= temp[8*0 + i] + temp[8*7 + i]; | |
133 tmp7= temp[8*0 + i] - temp[8*7 + i]; | |
134 tmp1= temp[8*1 + i] + temp[8*6 + i]; | |
135 tmp6= temp[8*1 + i] - temp[8*6 + i]; | |
136 tmp2= temp[8*2 + i] + temp[8*5 + i]; | |
137 tmp5= temp[8*2 + i] - temp[8*5 + i]; | |
138 tmp3= temp[8*3 + i] + temp[8*4 + i]; | |
139 tmp4= temp[8*3 + i] - temp[8*4 + i]; | |
140 | |
141 tmp10= tmp0 + tmp3; | |
142 tmp13= tmp0 - tmp3; | |
143 tmp11= tmp1 + tmp2; | |
144 tmp12= tmp1 - tmp2; | |
145 | |
1563
820e06c6ca9b
4.9-RC FreeBSD doesnt like lrint() so change to lrintf(), not that bsd supports that but we emulate it ...
michael
parents:
1562
diff
changeset
|
146 data[8*0 + i]= lrintf(SCALE(8*0 + i) * (tmp10 + tmp11)); |
820e06c6ca9b
4.9-RC FreeBSD doesnt like lrint() so change to lrintf(), not that bsd supports that but we emulate it ...
michael
parents:
1562
diff
changeset
|
147 data[8*4 + i]= lrintf(SCALE(8*4 + i) * (tmp10 - tmp11)); |
1557 | 148 |
149 z1= (tmp12 + tmp13)* A1; | |
1563
820e06c6ca9b
4.9-RC FreeBSD doesnt like lrint() so change to lrintf(), not that bsd supports that but we emulate it ...
michael
parents:
1562
diff
changeset
|
150 data[8*2 + i]= lrintf(SCALE(8*2 + i) * (tmp13 + z1)); |
820e06c6ca9b
4.9-RC FreeBSD doesnt like lrint() so change to lrintf(), not that bsd supports that but we emulate it ...
michael
parents:
1562
diff
changeset
|
151 data[8*6 + i]= lrintf(SCALE(8*6 + i) * (tmp13 - z1)); |
1557 | 152 |
153 tmp10= tmp4 + tmp5; | |
154 tmp11= tmp5 + tmp6; | |
155 tmp12= tmp6 + tmp7; | |
156 | |
157 z5= (tmp10 - tmp12) * A5; | |
158 z2= tmp10*A2 + z5; | |
159 z4= tmp12*A4 + z5; | |
160 z3= tmp11*A1; | |
161 | |
162 z11= tmp7 + z3; | |
163 z13= tmp7 - z3; | |
164 | |
1563
820e06c6ca9b
4.9-RC FreeBSD doesnt like lrint() so change to lrintf(), not that bsd supports that but we emulate it ...
michael
parents:
1562
diff
changeset
|
165 data[8*5 + i]= lrintf(SCALE(8*5 + i) * (z13 + z2)); |
820e06c6ca9b
4.9-RC FreeBSD doesnt like lrint() so change to lrintf(), not that bsd supports that but we emulate it ...
michael
parents:
1562
diff
changeset
|
166 data[8*3 + i]= lrintf(SCALE(8*3 + i) * (z13 - z2)); |
820e06c6ca9b
4.9-RC FreeBSD doesnt like lrint() so change to lrintf(), not that bsd supports that but we emulate it ...
michael
parents:
1562
diff
changeset
|
167 data[8*1 + i]= lrintf(SCALE(8*1 + i) * (z11 + z4)); |
820e06c6ca9b
4.9-RC FreeBSD doesnt like lrint() so change to lrintf(), not that bsd supports that but we emulate it ...
michael
parents:
1562
diff
changeset
|
168 data[8*7 + i]= lrintf(SCALE(8*7 + i) * (z11 - z4)); |
1557 | 169 } |
170 } | |
1571
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
171 |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
172 void ff_faandct248(DCTELEM * data) |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
173 { |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
174 FLOAT tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
175 FLOAT tmp10, tmp11, tmp12, tmp13; |
1589 | 176 FLOAT z1; |
1571
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
177 FLOAT temp[64]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
178 int i; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
179 |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
180 emms_c(); |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
181 |
1589 | 182 row_fdct(temp, data); |
1571
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
183 |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
184 for (i=0; i<8; i++) { |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
185 tmp0 = temp[8*0 + i] + temp[8*1 + i]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
186 tmp1 = temp[8*2 + i] + temp[8*3 + i]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
187 tmp2 = temp[8*4 + i] + temp[8*5 + i]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
188 tmp3 = temp[8*6 + i] + temp[8*7 + i]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
189 tmp4 = temp[8*0 + i] - temp[8*1 + i]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
190 tmp5 = temp[8*2 + i] - temp[8*3 + i]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
191 tmp6 = temp[8*4 + i] - temp[8*5 + i]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
192 tmp7 = temp[8*6 + i] - temp[8*7 + i]; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
193 |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
194 tmp10 = tmp0 + tmp3; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
195 tmp11 = tmp1 + tmp2; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
196 tmp12 = tmp1 - tmp2; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
197 tmp13 = tmp0 - tmp3; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
198 |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
199 data[8*0 + i] = lrintf(SCALE(8*0 + i) * (tmp10 + tmp11)); |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
200 data[8*4 + i] = lrintf(SCALE(8*4 + i) * (tmp10 - tmp11)); |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
201 |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
202 z1 = (tmp12 + tmp13)* A1; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
203 data[8*2 + i] = lrintf(SCALE(8*2 + i) * (tmp13 + z1)); |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
204 data[8*6 + i] = lrintf(SCALE(8*6 + i) * (tmp13 - z1)); |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
205 |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
206 tmp10 = tmp4 + tmp7; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
207 tmp11 = tmp5 + tmp6; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
208 tmp12 = tmp5 - tmp6; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
209 tmp13 = tmp4 - tmp7; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
210 |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
211 data[8*1 + i] = lrintf(SCALE(8*0 + i) * (tmp10 + tmp11)); |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
212 data[8*5 + i] = lrintf(SCALE(8*4 + i) * (tmp10 - tmp11)); |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
213 |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
214 z1 = (tmp12 + tmp13)* A1; |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
215 data[8*3 + i] = lrintf(SCALE(8*2 + i) * (tmp13 + z1)); |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
216 data[8*7 + i] = lrintf(SCALE(8*6 + i) * (tmp13 - z1)); |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
217 } |
aa4dc16c0f18
* adding integer/floating point AAN implementations for DCT 2-4-8
romansh
parents:
1563
diff
changeset
|
218 } |