Mercurial > libavcodec.hg
annotate i386/vp3dsp_sse2.c @ 2942:868c48736d1c libavcodec
fixed long standing off-by-one bug (fixes playback on ppc)
author | alex |
---|---|
date | Sun, 13 Nov 2005 17:48:27 +0000 |
parents | fd5d7c732c6b |
children | ef2149182f1c |
rev | line source |
---|---|
1970 | 1 /* |
2 * Copyright (C) 2004 the ffmpeg project | |
3 * | |
4 * This library is free software; you can redistribute it and/or | |
5 * modify it under the terms of the GNU Lesser General Public | |
6 * License as published by the Free Software Foundation; either | |
7 * version 2 of the License, or (at your option) any later version. | |
8 * | |
9 * This library is distributed in the hope that it will be useful, | |
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
12 * Lesser General Public License for more details. | |
13 * | |
14 * You should have received a copy of the GNU Lesser General Public | |
15 * License along with this library; if not, write to the Free Software | |
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
17 */ | |
18 | |
19 /** | |
20 * @file vp3dsp_sse2.c | |
21 * SSE2-optimized functions cribbed from the original VP3 source code. | |
22 */ | |
23 | |
24 #include "../dsputil.h" | |
25 #include "mmx.h" | |
26 | |
/*
 * Word-select masks: seven rows of eight 16-bit words (16 bytes per row),
 * every word being either 0 or 65535 (0xFFFF). The hex comment at the end of
 * each row shows that same row written from the highest word down to the
 * lowest, i.e. in the order used by the register diagrams in this file.
 * NOTE(review): SSE2_Dequantize loads masks from (ecx), (ecx + 16), ...,
 * (ecx + 96) and its comments match rows 0..6 of this table; presumably ecx
 * points here -- confirm at the place where ecx is set up.
 */
2753 | 27 static const unsigned short __align16 SSE2_dequant_const[] =
1970 | 28 {
29 0,65535,65535,0,0,0,0,0, // 0x0000 0000 0000 0000 0000 FFFF FFFF 0000 | |
30 0,0,0,0,65535,65535,0,0, // 0x0000 0000 FFFF FFFF 0000 0000 0000 0000 | |
31 65535,65535,65535,0,0,0,0,0,// 0x0000 0000 0000 0000 0000 FFFF FFFF FFFF | |
32 0,0,0,65535,0,0,0,0, // 0x0000 0000 0000 0000 FFFF 0000 0000 0000 | |
33 0,0,0,65535,65535,0,0,0, // 0x0000 0000 0000 FFFF FFFF 0000 0000 0000 | |
34 65535,0,0,0,0,65535,0,0, // 0x0000 0000 FFFF 0000 0000 0000 0000 FFFF | |
35 0,0,65535,65535, 0,0,0,0 // 0x0000 0000 0000 0000 FFFF FFFF 0000 0000 | |
36 }; | |
37 | |
/*
 * Sixteen bytes holding 0x00080008 four times, i.e. the value 8 in each of
 * the eight 16-bit halves.
 * NOTE(review): SSE2_Column_IDCT adds *Eight to its results right before an
 * arithmetic shift right by 4, which looks like a round-to-nearest bias;
 * presumably Eight refers to this table -- confirm where Eight is defined.
 */
2753 | 38 static const unsigned int __align16 eight_data[] =
1970 | 39 {
40 0x00080008, | |
41 0x00080008, | |
42 0x00080008, | |
43 0x00080008 | |
44 }; | |
45 | |
/*
 * Seven rows of eight identical 16-bit words. Row k (k = 1..7, top to bottom)
 * holds round(65536 * cos(k * pi / 16)): 64277, 60547, 54491, 46341, 36410,
 * 25080, 12785.
 * The first five values are above 32767, so a signed 16-bit multiply (pmulhw)
 * sees them as (value - 65536); that is why the IDCT macros annotate those
 * products as "c * i - i" and add the input back with paddw afterwards. The
 * last two rows fit in a signed word and need no correction.
 * NOTE(review): presumably C(1)..C(7) in the IDCT macros address rows of this
 * table -- confirm where C() is defined.
 */
2753 | 46 static const unsigned short __align16 SSE2_idct_data[7 * 8] =
1970 | 47 {
48 64277,64277,64277,64277,64277,64277,64277,64277, | |
49 60547,60547,60547,60547,60547,60547,60547,60547, | |
50 54491,54491,54491,54491,54491,54491,54491,54491, | |
51 46341,46341,46341,46341,46341,46341,46341,46341, | |
52 36410,36410,36410,36410,36410,36410,36410,36410, | |
53 25080,25080,25080,25080,25080,25080,25080,25080, | |
54 12785,12785,12785,12785,12785,12785,12785,12785 | |
55 }; | |
56 | |
57 | |
/*
 * SSE2_Column_IDCT: one 8-point inverse DCT pass over eight 16-bit lanes at
 * once, with rounding and a final shift.
 *
 * Reads the eight input vectors through I(0)..I(7) and the cosine constants
 * through C(1)..C(7), adds *Eight to each result pair and shifts right
 * arithmetically by 4, then stores the eight results through O(0)..O(7).
 * I(1) and I(2) are overwritten as scratch storage (for C. and D.) while the
 * macro runs. All of xmm0..xmm7 are clobbered.
 *
 * The macros I(), C(), O() and the name Eight are not defined in this macro;
 * whoever expands it must define them first.
 *
 * Notation in the comments: iN = input N, cN = constant N; a trailing dot
 * marks the next stage of a value (A, A., A..); R0..R7 are the results before
 * the shift and op0..op7 the results after it. "c * i - i" is what pmulhw
 * yields for constants that do not fit in a signed word; the following paddw
 * adds i back.
 */
58 #define SSE2_Column_IDCT() { \ | |
59 \ | |
60 movdqu_m2r(*I(3), xmm2); /* xmm2 = i3 */ \ | |
61 movdqu_m2r(*C(3), xmm6); /* xmm6 = c3 */ \ | |
62 \ | |
63 movdqu_r2r(xmm2, xmm4); /* xmm4 = i3 */ \ | |
64 movdqu_m2r(*I(5), xmm7); /* xmm7 = i5 */ \ | |
65 \ | |
66 pmulhw_r2r(xmm6, xmm4); /* xmm4 = c3 * i3 - i3 */ \ | |
67 movdqu_m2r(*C(5), xmm1); /* xmm1 = c5 */ \ | |
68 \ | |
69 pmulhw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 - i5 */ \ | |
70 movdqu_r2r(xmm1, xmm5); /* xmm5 = c5 */ \ | |
71 \ | |
72 pmulhw_r2r(xmm2, xmm1); /* xmm1 = c5 * i3 - i3 */ \ | |
73 movdqu_m2r(*I(1), xmm3); /* xmm3 = i1 */ \ | |
74 \ | |
75 pmulhw_r2r(xmm7, xmm5); /* xmm5 = c5 * i5 - i5 */ \ | |
76 movdqu_m2r(*C(1), xmm0); /* xmm0 = c1 */ \ | |
77 \ | |
78 /* all registers are in use */ \ | |
79 \ | |
80 paddw_r2r(xmm2, xmm4); /* xmm4 = c3 * i3 */ \ | |
81 paddw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 */ \ | |
82 \ | |
83 paddw_r2r(xmm1, xmm2); /* xmm2 = c5 * i3 */ \ | |
84 movdqu_m2r(*I(7), xmm1); /* xmm1 = i7 */ \ | |
85 \ | |
86 paddw_r2r(xmm5, xmm7); /* xmm7 = c5 * i5 */ \ | |
87 movdqu_r2r(xmm0, xmm5); /* xmm5 = c1 */ \ | |
88 \ | |
89 pmulhw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 - i1 */ \ | |
90 paddsw_r2r(xmm7, xmm4); /* xmm4 = c3 * i3 + c5 * i5 = C */ \ | |
91 \ | |
92 pmulhw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 - i7 */ \ | |
93 movdqu_m2r(*C(7), xmm7); /* xmm7 = c7 */ \ | |
94 \ | |
95 psubsw_r2r(xmm2, xmm6); /* xmm6 = c3 * i5 - c5 * i3 = D */ \ | |
96 paddw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 */ \ | |
97 \ | |
98 pmulhw_r2r(xmm7, xmm3); /* xmm3 = c7 * i1 */ \ | |
99 movdqu_m2r(*I(2), xmm2); /* xmm2 = i2 */ \ | |
100 \ | |
101 pmulhw_r2r(xmm1, xmm7); /* xmm7 = c7 * i7 */ \ | |
102 paddw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 */ \ | |
103 \ | |
104 movdqu_r2r(xmm2, xmm1); /* xmm1 = i2 */ \ | |
105 pmulhw_m2r(*C(2), xmm2); /* xmm2 = c2 * i2 - i2 */ \ | |
106 \ | |
107 psubsw_r2r(xmm5, xmm3); /* xmm3 = c7 * i1 - c1 * i7 = B */ \ | |
108 movdqu_m2r(*I(6), xmm5); /* xmm5 = i6 */ \ | |
109 \ | |
110 paddsw_r2r(xmm7, xmm0); /* xmm0 = c1 * i1 + c7 * i7 = A */ \ | |
111 movdqu_r2r(xmm5, xmm7); /* xmm7 = i6 */ \ | |
112 \ | |
113 psubsw_r2r(xmm4, xmm0); /* xmm0 = A - C */ \ | |
114 pmulhw_m2r(*C(2), xmm5); /* xmm5 = c2 * i6 - i6 */ \ | |
115 \ | |
116 paddw_r2r(xmm1, xmm2); /* xmm2 = c2 * i2 */ \ | |
117 pmulhw_m2r(*C(6), xmm1); /* xmm1 = c6 * i2 */ \ | |
118 \ | |
119 paddsw_r2r(xmm4, xmm4); /* xmm4 = C + C */ \ | |
120 paddsw_r2r(xmm0, xmm4); /* xmm4 = A + C = C. */ \ | |
121 \ | |
122 psubsw_r2r(xmm6, xmm3); /* xmm3 = B - D */ \ | |
123 paddw_r2r(xmm7, xmm5); /* xmm5 = c2 * i6 */ \ | |
124 \ | |
125 paddsw_r2r(xmm6, xmm6); /* xmm6 = D + D */ \ | |
126 pmulhw_m2r(*C(6), xmm7); /* xmm7 = c6 * i6 */ \ | |
127 \ | |
128 paddsw_r2r(xmm3, xmm6); /* xmm6 = B + D = D. */ \ | |
129 movdqu_r2m(xmm4, *I(1)); /* Save C. at I(1) (input i1 is no longer needed) */ \ | |
130 \ | |
131 psubsw_r2r(xmm5, xmm1); /* xmm1 = c6 * i2 - c2 * i6 = H */ \ | |
132 movdqu_m2r(*C(4), xmm4); /* xmm4 = c4 */ \ | |
133 \ | |
134 movdqu_r2r(xmm3, xmm5); /* xmm5 = B - D */ \ | |
135 pmulhw_r2r(xmm4, xmm3); /* xmm3 = ( c4 - 1 ) * ( B - D ) */ \ | |
136 \ | |
137 paddsw_r2r(xmm2, xmm7); /* xmm7 = c2 * i2 + c6 * i6 = G */ \ | |
138 movdqu_r2m(xmm6, *I(2)); /* Save D. at I(2) (input i2 is no longer needed) */ \ | |
139 \ | |
140 movdqu_r2r(xmm0, xmm2); /* xmm2 = A - C */ \ | |
141 movdqu_m2r(*I(0), xmm6); /* xmm6 = i0 */ \ | |
142 \ | |
143 pmulhw_r2r(xmm4, xmm0); /* xmm0 = ( c4 - 1 ) * ( A - C ) */ \ | |
144 paddw_r2r(xmm3, xmm5); /* xmm5 = c4 * ( B - D ) = B. */ \ | |
145 \ | |
146 movdqu_m2r(*I(4), xmm3); /* xmm3 = i4 */ \ | |
147 psubsw_r2r(xmm1, xmm5); /* xmm5 = B. - H = B.. */ \ | |
148 \ | |
149 paddw_r2r(xmm0, xmm2); /* xmm2 = c4 * ( A - C ) = A. */ \ | |
150 psubsw_r2r(xmm3, xmm6); /* xmm6 = i0 - i4 */ \ | |
151 \ | |
152 movdqu_r2r(xmm6, xmm0); /* xmm0 = i0 - i4 */ \ | |
153 pmulhw_r2r(xmm4, xmm6); /* xmm6 = ( c4 - 1 ) * ( i0 - i4 ) */ \ | |
154 \ | |
155 paddsw_r2r(xmm3, xmm3); /* xmm3 = i4 + i4 */ \ | |
156 paddsw_r2r(xmm1, xmm1); /* xmm1 = H + H */ \ | |
157 \ | |
158 paddsw_r2r(xmm0, xmm3); /* xmm3 = i0 + i4 */ \ | |
159 paddsw_r2r(xmm5, xmm1); /* xmm1 = B. + H = H. */ \ | |
160 \ | |
161 pmulhw_r2r(xmm3, xmm4); /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \ | |
162 paddw_r2r(xmm0, xmm6); /* xmm6 = c4 * ( i0 - i4 ) = F */ \ | |
163 \ | |
164 psubsw_r2r(xmm2, xmm6); /* xmm6 = F - A. = F. */ \ | |
165 paddsw_r2r(xmm2, xmm2); /* xmm2 = A. + A. */ \ | |
166 \ | |
167 movdqu_m2r(*I(1), xmm0); /* Load C. from I(1) */ \ | |
168 paddsw_r2r(xmm6, xmm2); /* xmm2 = F + A. = A.. */ \ | |
169 \ | |
170 paddw_r2r(xmm3, xmm4); /* xmm4 = c4 * ( i0 + i4 ) = E */ \ | |
171 psubsw_r2r(xmm1, xmm2); /* xmm2 = A.. - H. = R2 */ \ | |
172 \ | |
173 paddsw_m2r(*Eight, xmm2); /* Adjust R2 and R1 before shifting */ \ | |
174 paddsw_r2r(xmm1, xmm1); /* xmm1 = H. + H. */ \ | |
175 \ | |
176 paddsw_r2r(xmm2, xmm1); /* xmm1 = A.. + H. = R1 */ \ | |
177 psraw_i2r(4, xmm2); /* xmm2 = op2 */ \ | |
178 \ | |
179 psubsw_r2r(xmm7, xmm4); /* xmm4 = E - G = E. */ \ | |
180 psraw_i2r(4, xmm1); /* xmm1 = op1 */ \ | |
181 \ | |
182 movdqu_m2r(*I(2), xmm3); /* Load D. from I(2) */ \ | |
183 paddsw_r2r(xmm7, xmm7); /* xmm7 = G + G */ \ | |
184 \ | |
185 movdqu_r2m(xmm2, *O(2)); /* Write out op2 */ \ | |
186 paddsw_r2r(xmm4, xmm7); /* xmm7 = E + G = G. */ \ | |
187 \ | |
188 movdqu_r2m(xmm1, *O(1)); /* Write out op1 */ \ | |
189 psubsw_r2r(xmm3, xmm4); /* xmm4 = E. - D. = R4 */ \ | |
190 \ | |
191 paddsw_m2r(*Eight, xmm4); /* Adjust R4 and R3 before shifting */ \ | |
192 paddsw_r2r(xmm3, xmm3); /* xmm3 = D. + D. */ \ | |
193 \ | |
194 paddsw_r2r(xmm4, xmm3); /* xmm3 = E. + D. = R3 */ \ | |
195 psraw_i2r(4, xmm4); /* xmm4 = op4 */ \ | |
196 \ | |
197 psubsw_r2r(xmm5, xmm6); /* xmm6 = F. - B..= R6 */ \ | |
198 psraw_i2r(4, xmm3); /* xmm3 = op3 */ \ | |
199 \ | |
200 paddsw_m2r(*Eight, xmm6); /* Adjust R6 and R5 before shifting */ \ | |
201 paddsw_r2r(xmm5, xmm5); /* xmm5 = B.. + B.. */ \ | |
202 \ | |
203 paddsw_r2r(xmm6, xmm5); /* xmm5 = F. + B.. = R5 */ \ | |
204 psraw_i2r(4, xmm6); /* xmm6 = op6 */ \ | |
205 \ | |
206 movdqu_r2m(xmm4, *O(4)); /* Write out op4 */ \ | |
207 psraw_i2r(4, xmm5); /* xmm5 = op5 */ \ | |
208 \ | |
209 movdqu_r2m(xmm3, *O(3)); /* Write out op3 */ \ | |
210 psubsw_r2r(xmm0, xmm7); /* xmm7 = G. - C. = R7 */ \ | |
211 \ | |
212 paddsw_m2r(*Eight, xmm7); /* Adjust R7 and R0 before shifting */ \ | |
213 paddsw_r2r(xmm0, xmm0); /* xmm0 = C. + C. */ \ | |
214 \ | |
215 paddsw_r2r(xmm7, xmm0); /* xmm0 = G. + C. = R0 */ \ | |
216 psraw_i2r(4, xmm7); /* xmm7 = op7 */ \ | |
217 \ | |
218 movdqu_r2m(xmm6, *O(6)); /* Write out op6 */ \ | |
219 psraw_i2r(4, xmm0); /* xmm0 = op0 */ \ | |
220 \ | |
221 movdqu_r2m(xmm5, *O(5)); /* Write out op5 */ \ | |
222 movdqu_r2m(xmm7, *O(7)); /* Write out op7 */ \ | |
223 \ | |
224 movdqu_r2m(xmm0, *O(0)); /* Write out op0 */ \ | |
225 \ | |
226 } /* End of SSE2_Column_IDCT macro */ | |
227 | |
228 | |
/*
 * SSE2_Row_IDCT: one 8-point inverse DCT pass over eight 16-bit lanes at
 * once, without rounding or shifting.
 *
 * Performs the same butterfly sequence as SSE2_Column_IDCT but adds no bias,
 * applies no shift, and stores the eight results R0..R7 back in place through
 * I(0)..I(7) instead of through O(). I(1) and I(2) also serve as scratch
 * storage (for C. and D.) part-way through, before they receive their final
 * results. All of xmm0..xmm7 are clobbered.
 *
 * The macros I() and C() are not defined in this macro; whoever expands it
 * must define them first.
 *
 * Notation in the comments: iN = input N, cN = constant N; a trailing dot
 * marks the next stage of a value (A, A., A..). "c * i - i" is what pmulhw
 * yields for constants that do not fit in a signed word; the following paddw
 * adds i back.
 */
229 #define SSE2_Row_IDCT() { \ | |
230 \ | |
231 movdqu_m2r(*I(3), xmm2); /* xmm2 = i3 */ \ | |
232 movdqu_m2r(*C(3), xmm6); /* xmm6 = c3 */ \ | |
233 \ | |
234 movdqu_r2r(xmm2, xmm4); /* xmm4 = i3 */ \ | |
235 movdqu_m2r(*I(5), xmm7); /* xmm7 = i5 */ \ | |
236 \ | |
237 pmulhw_r2r(xmm6, xmm4); /* xmm4 = c3 * i3 - i3 */ \ | |
238 movdqu_m2r(*C(5), xmm1); /* xmm1 = c5 */ \ | |
239 \ | |
240 pmulhw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 - i5 */ \ | |
241 movdqu_r2r(xmm1, xmm5); /* xmm5 = c5 */ \ | |
242 \ | |
243 pmulhw_r2r(xmm2, xmm1); /* xmm1 = c5 * i3 - i3 */ \ | |
244 movdqu_m2r(*I(1), xmm3); /* xmm3 = i1 */ \ | |
245 \ | |
246 pmulhw_r2r(xmm7, xmm5); /* xmm5 = c5 * i5 - i5 */ \ | |
247 movdqu_m2r(*C(1), xmm0); /* xmm0 = c1 */ \ | |
248 \ | |
249 /* all registers are in use */ \ | |
250 \ | |
251 paddw_r2r(xmm2, xmm4); /* xmm4 = c3 * i3 */ \ | |
252 paddw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 */ \ | |
253 \ | |
254 paddw_r2r(xmm1, xmm2); /* xmm2 = c5 * i3 */ \ | |
255 movdqu_m2r(*I(7), xmm1); /* xmm1 = i7 */ \ | |
256 \ | |
257 paddw_r2r(xmm5, xmm7); /* xmm7 = c5 * i5 */ \ | |
258 movdqu_r2r(xmm0, xmm5); /* xmm5 = c1 */ \ | |
259 \ | |
260 pmulhw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 - i1 */ \ | |
261 paddsw_r2r(xmm7, xmm4); /* xmm4 = c3 * i3 + c5 * i5 = C */ \ | |
262 \ | |
263 pmulhw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 - i7 */ \ | |
264 movdqu_m2r(*C(7), xmm7); /* xmm7 = c7 */ \ | |
265 \ | |
266 psubsw_r2r(xmm2, xmm6); /* xmm6 = c3 * i5 - c5 * i3 = D */ \ | |
267 paddw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 */ \ | |
268 \ | |
269 pmulhw_r2r(xmm7, xmm3); /* xmm3 = c7 * i1 */ \ | |
270 movdqu_m2r(*I(2), xmm2); /* xmm2 = i2 */ \ | |
271 \ | |
272 pmulhw_r2r(xmm1, xmm7); /* xmm7 = c7 * i7 */ \ | |
273 paddw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 */ \ | |
274 \ | |
275 movdqu_r2r(xmm2, xmm1); /* xmm1 = i2 */ \ | |
276 pmulhw_m2r(*C(2), xmm2); /* xmm2 = c2 * i2 - i2 */ \ | |
277 \ | |
278 psubsw_r2r(xmm5, xmm3); /* xmm3 = c7 * i1 - c1 * i7 = B */ \ | |
279 movdqu_m2r(*I(6), xmm5); /* xmm5 = i6 */ \ | |
280 \ | |
281 paddsw_r2r(xmm7, xmm0); /* xmm0 = c1 * i1 + c7 * i7 = A */ \ | |
282 movdqu_r2r(xmm5, xmm7); /* xmm7 = i6 */ \ | |
283 \ | |
284 psubsw_r2r(xmm4, xmm0); /* xmm0 = A - C */ \ | |
285 pmulhw_m2r(*C(2), xmm5); /* xmm5 = c2 * i6 - i6 */ \ | |
286 \ | |
287 paddw_r2r(xmm1, xmm2); /* xmm2 = c2 * i2 */ \ | |
288 pmulhw_m2r(*C(6), xmm1); /* xmm1 = c6 * i2 */ \ | |
289 \ | |
290 paddsw_r2r(xmm4, xmm4); /* xmm4 = C + C */ \ | |
291 paddsw_r2r(xmm0, xmm4); /* xmm4 = A + C = C. */ \ | |
292 \ | |
293 psubsw_r2r(xmm6, xmm3); /* xmm3 = B - D */ \ | |
294 paddw_r2r(xmm7, xmm5); /* xmm5 = c2 * i6 */ \ | |
295 \ | |
296 paddsw_r2r(xmm6, xmm6); /* xmm6 = D + D */ \ | |
297 pmulhw_m2r(*C(6), xmm7); /* xmm7 = c6 * i6 */ \ | |
298 \ | |
299 paddsw_r2r(xmm3, xmm6); /* xmm6 = B + D = D. */ \ | |
300 movdqu_r2m(xmm4, *I(1)); /* Save C. at I(1) (input i1 is no longer needed) */ \ | |
301 \ | |
302 psubsw_r2r(xmm5, xmm1); /* xmm1 = c6 * i2 - c2 * i6 = H */ \ | |
303 movdqu_m2r(*C(4), xmm4); /* xmm4 = c4 */ \ | |
304 \ | |
305 movdqu_r2r(xmm3, xmm5); /* xmm5 = B - D */ \ | |
306 pmulhw_r2r(xmm4, xmm3); /* xmm3 = ( c4 - 1 ) * ( B - D ) */ \ | |
307 \ | |
308 paddsw_r2r(xmm2, xmm7); /* xmm7 = c2 * i2 + c6 * i6 = G */ \ | |
309 movdqu_r2m(xmm6, *I(2)); /* Save D. at I(2) (input i2 is no longer needed) */ \ | |
310 \ | |
311 movdqu_r2r(xmm0, xmm2); /* xmm2 = A - C */ \ | |
312 movdqu_m2r(*I(0), xmm6); /* xmm6 = i0 */ \ | |
313 \ | |
314 pmulhw_r2r(xmm4, xmm0); /* xmm0 = ( c4 - 1 ) * ( A - C ) */ \ | |
315 paddw_r2r(xmm3, xmm5); /* xmm5 = c4 * ( B - D ) = B. */ \ | |
316 \ | |
317 movdqu_m2r(*I(4), xmm3); /* xmm3 = i4 */ \ | |
318 psubsw_r2r(xmm1, xmm5); /* xmm5 = B. - H = B.. */ \ | |
319 \ | |
320 paddw_r2r(xmm0, xmm2); /* xmm2 = c4 * ( A - C ) = A. */ \ | |
321 psubsw_r2r(xmm3, xmm6); /* xmm6 = i0 - i4 */ \ | |
322 \ | |
323 movdqu_r2r(xmm6, xmm0); /* xmm0 = i0 - i4 */ \ | |
324 pmulhw_r2r(xmm4, xmm6); /* xmm6 = ( c4 - 1 ) * ( i0 - i4 ) */ \ | |
325 \ | |
326 paddsw_r2r(xmm3, xmm3); /* xmm3 = i4 + i4 */ \ | |
327 paddsw_r2r(xmm1, xmm1); /* xmm1 = H + H */ \ | |
328 \ | |
329 paddsw_r2r(xmm0, xmm3); /* xmm3 = i0 + i4 */ \ | |
330 paddsw_r2r(xmm5, xmm1); /* xmm1 = B. + H = H. */ \ | |
331 \ | |
332 pmulhw_r2r(xmm3, xmm4); /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \ | |
333 paddw_r2r(xmm0, xmm6); /* xmm6 = c4 * ( i0 - i4 ) = F */ \ | |
334 \ | |
335 psubsw_r2r(xmm2, xmm6); /* xmm6 = F - A. = F. */ \ | |
336 paddsw_r2r(xmm2, xmm2); /* xmm2 = A. + A. */ \ | |
337 \ | |
338 movdqu_m2r(*I(1), xmm0); /* Load C. from I(1) */ \ | |
339 paddsw_r2r(xmm6, xmm2); /* xmm2 = F + A. = A.. */ \ | |
340 \ | |
341 paddw_r2r(xmm3, xmm4); /* xmm4 = c4 * ( i0 + i4 ) = E */ \ | |
342 psubsw_r2r(xmm1, xmm2); /* xmm2 = A.. - H. = R2 */ \ | |
343 \ | |
344 paddsw_r2r(xmm1, xmm1); /* xmm1 = H. + H. */ \ | |
345 paddsw_r2r(xmm2, xmm1); /* xmm1 = A.. + H. = R1 */ \ | |
346 \ | |
347 psubsw_r2r(xmm7, xmm4); /* xmm4 = E - G = E. */ \ | |
348 \ | |
349 movdqu_m2r(*I(2), xmm3); /* Load D. from I(2) */ \ | |
350 paddsw_r2r(xmm7, xmm7); /* xmm7 = G + G */ \ | |
351 \ | |
352 movdqu_r2m(xmm2, *I(2)); /* Write R2 back to I(2) (unshifted) */ \ | |
353 paddsw_r2r(xmm4, xmm7); /* xmm7 = E + G = G. */ \ | |
354 \ | |
355 movdqu_r2m(xmm1, *I(1)); /* Write R1 back to I(1) (unshifted) */ \ | |
356 psubsw_r2r(xmm3, xmm4); /* xmm4 = E. - D. = R4 */ \ | |
357 \ | |
358 paddsw_r2r(xmm3, xmm3); /* xmm3 = D. + D. */ \ | |
359 \ | |
360 paddsw_r2r(xmm4, xmm3); /* xmm3 = E. + D. = R3 */ \ | |
361 \ | |
362 psubsw_r2r(xmm5, xmm6); /* xmm6 = F. - B..= R6 */ \ | |
363 \ | |
364 paddsw_r2r(xmm5, xmm5); /* xmm5 = B.. + B.. */ \ | |
365 \ | |
366 paddsw_r2r(xmm6, xmm5); /* xmm5 = F. + B.. = R5 */ \ | |
367 \ | |
368 movdqu_r2m(xmm4, *I(4)); /* Write R4 back to I(4) (unshifted) */ \ | |
369 \ | |
370 movdqu_r2m(xmm3, *I(3)); /* Write R3 back to I(3) (unshifted) */ \ | |
371 psubsw_r2r(xmm0, xmm7); /* xmm7 = G. - C. = R7 */ \ | |
372 \ | |
373 paddsw_r2r(xmm0, xmm0); /* xmm0 = C. + C. */ \ | |
374 \ | |
375 paddsw_r2r(xmm7, xmm0); /* xmm0 = G. + C. = R0 */ \ | |
376 \ | |
377 movdqu_r2m(xmm6, *I(6)); /* Write R6 back to I(6) (unshifted) */ \ | |
378 \ | |
379 movdqu_r2m(xmm5, *I(5)); /* Write R5 back to I(5) (unshifted) */ \ | |
380 movdqu_r2m(xmm7, *I(7)); /* Write R7 back to I(7) (unshifted) */ \ | |
381 \ | |
382 movdqu_r2m(xmm0, *I(0)); /* Write R0 back to I(0) (unshifted) */ \ | |
383 \ | |
384 } /* End of SSE2_Row_IDCT macro */ | |
385 | |
386 | |
/*
 * SSE2_Transpose: in-place transpose of an 8x8 block of 16-bit words.
 *
 * Loads the eight row vectors through I(0)..I(7) (named a..h in the comments,
 * each shown from word 7 down to word 0), interleaves them with the
 * punpck{l,h}wd / dq / qdq instructions and stores the result back through
 * the same I(0)..I(7), so that afterwards I(n) holds word n of every
 * original row.
 * I(6) is used as a temporary spill slot for one intermediate vector before
 * it receives its final value. All of xmm0..xmm7 are clobbered.
 *
 * The macro I() is not defined in this macro; whoever expands it must define
 * it first.
 */
387 #define SSE2_Transpose() { \ | |
388 \ | |
389 movdqu_m2r(*I(4), xmm4); /* xmm4=e7e6e5e4e3e2e1e0 */ \ | |
390 movdqu_m2r(*I(5), xmm0); /* xmm0=f7f6f5f4f3f2f1f0 */ \ | |
391 \ | |
392 movdqu_r2r(xmm4, xmm5); /* make a copy */ \ | |
393 punpcklwd_r2r(xmm0, xmm4); /* xmm4=f3e3f2e2f1e1f0e0 */ \ | |
394 \ | |
395 punpckhwd_r2r(xmm0, xmm5); /* xmm5=f7e7f6e6f5e5f4e4 */ \ | |
396 movdqu_m2r(*I(6), xmm6); /* xmm6=g7g6g5g4g3g2g1g0 */ \ | |
397 \ | |
398 movdqu_m2r(*I(7), xmm0); /* xmm0=h7h6h5h4h3h2h1h0 */ \ | |
399 movdqu_r2r(xmm6, xmm7); /* make a copy */ \ | |
400 \ | |
401 punpcklwd_r2r(xmm0, xmm6); /* xmm6=h3g3h2g2h1g1h0g0 */ \ | |
402 punpckhwd_r2r(xmm0, xmm7); /* xmm7=h7g7h6g6h5g5h4g4 */ \ | |
403 \ | |
404 movdqu_r2r(xmm4, xmm3); /* make a copy */ \ | |
405 punpckldq_r2r(xmm6, xmm4); /* xmm4=h1g1f1e1h0g0f0e0 */ \ | |
406 \ | |
407 punpckhdq_r2r(xmm6, xmm3); /* xmm3=h3g3f3e3h2g2f2e2 */ \ | |
408 movdqu_r2m(xmm3, *I(6)); /* save h3g3f3e3h2g2f2e2 (I(6) used as temporary) */ \ | |
409 /* Free xmm6 */ \ | |
410 movdqu_r2r(xmm5, xmm6); /* make a copy */ \ | |
411 punpckldq_r2r(xmm7, xmm5); /* xmm5=h5g5f5e5h4g4f4e4 */ \ | |
412 \ | |
413 punpckhdq_r2r(xmm7, xmm6); /* xmm6=h7g7f7e7h6g6f6e6 */ \ | |
414 movdqu_m2r(*I(0), xmm0); /* xmm0=a7a6a5a4a3a2a1a0 */ \ | |
415 /* Free xmm7 */ \ | |
416 movdqu_m2r(*I(1), xmm1); /* xmm1=b7b6b5b4b3b2b1b0 */ \ | |
417 movdqu_r2r(xmm0, xmm7); /* make a copy */ \ | |
418 \ | |
419 punpcklwd_r2r(xmm1, xmm0); /* xmm0=b3a3b2a2b1a1b0a0 */ \ | |
420 punpckhwd_r2r(xmm1, xmm7); /* xmm7=b7a7b6a6b5a5b4a4 */ \ | |
421 /* Free xmm1 */ \ | |
422 movdqu_m2r(*I(2), xmm2); /* xmm2=c7c6c5c4c3c2c1c0 */ \ | |
423 movdqu_m2r(*I(3), xmm3); /* xmm3=d7d6d5d4d3d2d1d0 */ \ | |
424 \ | |
425 movdqu_r2r(xmm2, xmm1); /* make a copy */ \ | |
426 punpcklwd_r2r(xmm3, xmm2); /* xmm2=d3c3d2c2d1c1d0c0 */ \ | |
427 \ | |
428 punpckhwd_r2r(xmm3, xmm1); /* xmm1=d7c7d6c6d5c5d4c4 */ \ | |
429 movdqu_r2r(xmm0, xmm3); /* make a copy */ \ | |
430 \ | |
431 punpckldq_r2r(xmm2, xmm0); /* xmm0=d1c1b1a1d0c0b0a0 */ \ | |
432 punpckhdq_r2r(xmm2, xmm3); /* xmm3=d3c3b3a3d2c2b2a2 */ \ | |
433 /* Free xmm2 */ \ | |
434 movdqu_r2r(xmm7, xmm2); /* make a copy */ \ | |
435 punpckldq_r2r(xmm1, xmm2); /* xmm2=d5c5b5a5d4c4b4a4 */ \ | |
436 \ | |
437 punpckhdq_r2r(xmm1, xmm7); /* xmm7=d7c7b7a7d6c6b6a6 */ \ | |
438 movdqu_r2r(xmm0, xmm1); /* make a copy */ \ | |
439 \ | |
440 punpcklqdq_r2r(xmm4, xmm0); /* xmm0=h0g0f0e0d0c0b0a0 */ \ | |
441 punpckhqdq_r2r(xmm4, xmm1); /* xmm1=h1g1f1e1d1c1b1a1 */ \ | |
442 \ | |
443 movdqu_r2m(xmm0, *I(0)); /* save I(0) */ \ | |
444 movdqu_r2m(xmm1, *I(1)); /* save I(1) */ \ | |
445 \ | |
446 movdqu_m2r(*I(6), xmm0); /* load h3g3f3e3h2g2f2e2 */ \ | |
447 movdqu_r2r(xmm3, xmm1); /* make a copy */ \ | |
448 \ | |
449 punpcklqdq_r2r(xmm0, xmm1); /* xmm1=h2g2f2e2d2c2b2a2 */ \ | |
450 punpckhqdq_r2r(xmm0, xmm3); /* xmm3=h3g3f3e3d3c3b3a3 */ \ | |
451 \ | |
452 movdqu_r2r(xmm2, xmm4); /* make a copy */ \ | |
453 punpcklqdq_r2r(xmm5, xmm4); /* xmm4=h4g4f4e4d4c4b4a4 */ \ | |
454 \ | |
455 punpckhqdq_r2r(xmm5, xmm2); /* xmm2=h5g5f5e5d5c5b5a5 */ \ | |
456 movdqu_r2m(xmm1, *I(2)); /* save I(2) */ \ | |
457 \ | |
458 movdqu_r2m(xmm3, *I(3)); /* save I(3) */ \ | |
459 movdqu_r2m(xmm4, *I(4)); /* save I(4) */ \ | |
460 \ | |
461 movdqu_r2m(xmm2, *I(5)); /* save I(5) */ \ | |
462 movdqu_r2r(xmm7, xmm5); /* make a copy */ \ | |
463 \ | |
464 punpcklqdq_r2r(xmm6, xmm5); /* xmm5=h6g6f6e6d6c6b6a6 */ \ | |
465 punpckhqdq_r2r(xmm6, xmm7); /* xmm7=h7g7f7e7d7c7b7a7 */ \ | |
466 \ | |
467 movdqu_r2m(xmm5, *I(6)); /* save I(6) */ \ | |
468 movdqu_r2m(xmm7, *I(7)); /* save I(7) */ \ | |
469 \ | |
470 } /* End of Transpose Macro */ | |
471 | |
472 | |
473 #define SSE2_Dequantize() { \ | |
474 movdqu_m2r(*(eax), xmm0); \ | |
475 \ | |
476 pmullw_m2r(*(ebx), xmm0); /* xmm0 = 07 06 05 04 03 02 01 00 */ \ | |
477 movdqu_m2r(*(eax + 16), xmm1); \ | |
478 \ | |
479 pmullw_m2r(*(ebx + 16), xmm1); /* xmm1 = 17 16 15 14 13 12 11 10 */ \ | |
480 pshuflw_r2r(xmm0, xmm3, 0x078); /* xmm3 = 07 06 05 04 01 03 02 00 */ \ | |
481 \ | |
482 movdqu_r2r(xmm1, xmm2); /* xmm2 = 17 16 15 14 13 12 11 10 */ \ | |
483 movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \ | |
484 \ | |
485 movdqu_m2r(*(eax + 32), xmm4); \ | |
486 movdqu_m2r(*(eax + 64), xmm5); \ | |
487 \ | |
488 pmullw_m2r(*(ebx + 32), xmm4); /* xmm4 = 27 26 25 24 23 22 21 20 */ \ | |
489 pmullw_m2r(*(ebx + 64), xmm5); /* xmm5 = 47 46 45 44 43 42 41 40 */ \ | |
490 \ | |
491 movdqu_m2r(*(ecx + 16), xmm6); /* xmm6 = -- -- FF FF -- -- -- -- */ \ | |
492 pand_r2r(xmm2, xmm7); /* xmm7 = -- -- -- -- -- 12 11 -- */ \ | |
493 \ | |
494 pand_r2r(xmm4, xmm6); /* xmm6 = -- -- 25 24 -- -- -- -- */ \ | |
495 pxor_r2r(xmm7, xmm2); /* xmm2 = 17 16 15 14 13 -- -- 10 */ \ | |
496 \ | |
497 pxor_r2r(xmm6, xmm4); /* xmm4 = 27 26 -- -- 23 22 21 20 */ \ | |
498 pslldq_i2r(4, xmm7); /* xmm7 = -- -- -- 12 11 -- -- -- */ \ | |
499 \ | |
500 pslldq_i2r(2, xmm6); /* xmm6 = -- 25 24 -- -- -- -- -- */ \ | |
501 por_r2r(xmm6, xmm7); /* xmm7 = -- 25 24 12 11 -- -- -- */ \ | |
502 \ | |
503 movdqu_m2r(*(ecx + 32), xmm0); /* xmm0 = -- -- -- -- -- FF FF FF */ \ | |
504 movdqu_m2r(*(ecx + 48), xmm6); /* xmm6 = -- -- -- -- FF -- -- -- */ \ | |
505 \ | |
506 pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- -- -- 03 02 00 */ \ | |
507 pand_r2r(xmm5, xmm6); /* xmm6 = -- -- -- -- 43 -- -- -- */ \ | |
508 \ | |
509 pxor_r2r(xmm0, xmm3); /* xmm3 = 07 06 05 04 01 -- -- -- */ \ | |
510 pxor_r2r(xmm6, xmm5); /* xmm5 = 47 46 45 44 -- 42 41 40 */ \ | |
511 \ | |
512 por_r2r(xmm7, xmm0); /* xmm0 = -- 25 24 12 11 03 02 00 */ \ | |
513 pslldq_i2r(8, xmm6); /* xmm6 = 43 -- -- -- -- -- -- -- */ \ | |
514 \ | |
515 por_r2r(xmm6, xmm0); /* xmm0 = 43 25 24 12 11 03 02 00 */ \ | |
516 /* 02345 in use */ \ | |
517 \ | |
518 movdqu_m2r(*(ecx + 64 ), xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \ | |
519 pshuflw_r2r(xmm5, xmm5, 0x0B4); /* xmm5 = 47 46 45 44 42 -- 41 40 */ \ | |
520 \ | |
521 movdqu_r2r(xmm1, xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \ | |
522 movdqu_r2r(xmm1, xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ | |
523 \ | |
524 movdqu_r2m(xmm0, *(eax)); /* write 43 25 24 12 11 03 02 00 */ \ | |
525 pshufhw_r2r(xmm4, xmm4, 0x0C2); /* xmm4 = 27 -- -- 26 23 22 21 20 */ \ | |
526 \ | |
527 pand_r2r(xmm4, xmm7); /* xmm7 = -- -- -- 26 23 -- -- -- */ \ | |
528 pand_r2r(xmm5, xmm1); /* xmm1 = -- -- -- 44 42 -- -- -- */ \ | |
529 \ | |
530 pxor_r2r(xmm7, xmm4); /* xmm4 = 27 -- -- -- -- 22 21 20 */ \ | |
531 pxor_r2r(xmm1, xmm5); /* xmm5 = 47 46 45 -- -- -- 41 40 */ \ | |
532 \ | |
533 pshuflw_r2r(xmm2, xmm2, 0x0C6); /* xmm2 = 17 16 15 14 13 10 -- -- */ \ | |
534 movdqu_r2r(xmm6, xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \ | |
535 \ | |
536 pslldq_i2r(2, xmm7); /* xmm7 = -- -- 26 23 -- -- -- -- */ \ | |
537 pslldq_i2r(6, xmm1); /* xmm1 = 44 42 -- -- -- -- -- -- */ \ | |
538 \ | |
539 psrldq_i2r(2, xmm0); /* xmm0 = -- -- -- -- FF FF -- -- */ \ | |
540 pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- 04 01 -- -- -- */ \ | |
541 \ | |
542 pand_r2r(xmm2, xmm0); /* xmm0 = -- -- -- -- 13 10 -- -- */ \ | |
543 pxor_r2r(xmm6, xmm3); /* xmm3 = 07 06 05 -- -- -- -- -- */ \ | |
544 \ | |
545 pxor_r2r(xmm0, xmm2); /* xmm2 = 17 16 15 14 -- -- -- -- */ \ | |
546 psrldq_i2r(6, xmm6); /* xmm0 = -- -- -- -- -- -- 04 01 */ \ | |
547 \ | |
548 por_r2r(xmm7, xmm1); /* xmm1 = 44 42 26 23 -- -- -- -- */ \ | |
549 por_r2r(xmm6, xmm0); /* xmm1 = -- -- -- -- 13 10 04 01 */ \ | |
550 /* 12345 in use */ \ | |
551 por_r2r(xmm0, xmm1); /* xmm1 = 44 42 26 23 13 10 04 01 */ \ | |
552 pshuflw_r2r(xmm4, xmm4, 0x093); /* xmm4 = 27 -- -- -- 22 21 20 -- */ \ | |
553 \ | |
554 pshufhw_r2r(xmm4, xmm4, 0x093); /* xmm4 = -- -- -- 27 22 21 20 -- */ \ | |
555 movdqu_r2m(xmm1, *(eax + 16)); /* write 44 42 26 23 13 10 04 01 */ \ | |
556 \ | |
557 pshufhw_r2r(xmm3, xmm3, 0x0D2); /* xmm3 = 07 05 -- 06 -- -- -- -- */ \ | |
558 movdqu_m2r(*(ecx + 64), xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \ | |
559 \ | |
560 pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- 06 -- -- -- -- */ \ | |
561 psrldq_i2r(12, xmm3); /* xmm3 = -- -- -- -- -- -- 07 05 */ \ | |
562 \ | |
563 psrldq_i2r(8, xmm0); /* xmm0 = -- -- -- -- -- -- -- 06 */ \ | |
564 \ | |
565 movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ | |
566 movdqu_m2r(*(ecx + 96), xmm7); /* xmm7 = -- -- -- -- FF FF -- -- */ \ | |
567 \ | |
568 pand_r2r(xmm4, xmm6); /* xmm6 = -- -- -- 27 22 -- -- -- */ \ | |
569 pxor_r2r(xmm6, xmm4); /* xmm4 = -- -- -- -- -- 21 20 -- */ \ | |
570 \ | |
571 por_r2r(xmm6, xmm3); /* xmm3 = -- -- -- 27 22 -- 07 05 */ \ | |
572 pand_r2r(xmm4, xmm7); /* xmm7 = -- -- -- -- -- 21 -- -- */ \ | |
573 \ | |
574 por_r2r(xmm7, xmm0); /* xmm0 = -- -- -- -- -- 21 -- 06 */ \ | |
575 pxor_r2r(xmm7, xmm4); /* xmm4 = -- -- -- -- -- -- 20 -- */ \ | |
576 \ | |
577 movdqu_m2r(*(ecx + 16 ), xmm6); /* xmm6 = -- -- FF FF -- -- -- -- */ \ | |
578 movdqu_m2r(*(ecx + 64 ), xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \ | |
579 \ | |
580 pand_r2r(xmm2, xmm6); /* xmm6 = -- -- 15 14 -- -- -- -- */ \ | |
581 pand_r2r(xmm6, xmm1); /* xmm1 = -- -- -- 14 -- -- -- -- */ \ | |
582 \ | |
583 pxor_r2r(xmm6, xmm2); /* xmm2 = 17 16 -- -- -- -- -- -- */ \ | |
584 pxor_r2r(xmm1, xmm6); /* xmm6 = -- -- 15 -- -- -- -- -- */ \ | |
585 \ | |
586 psrldq_i2r(4, xmm1); /* xmm1 = -- -- -- -- -- 14 -- -- */ \ | |
587 \ | |
588 psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- 15 -- */ \ | |
589 por_r2r(xmm1, xmm3); /* xmm3 = -- -- -- 27 22 14 07 05 */ \ | |
590 \ | |
591 por_r2r(xmm6, xmm0); /* xmm0 = -- -- -- -- -- 21 15 06 */ \ | |
592 pshufhw_r2r(xmm5, xmm5, 0x0E1); /* xmm5 = 47 46 -- 45 -- -- 41 40 */ \ | |
593 \ | |
594 movdqu_m2r(*(ecx + 64), xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \ | |
595 pshuflw_r2r(xmm5, xmm5, 0x072); /* xmm5 = 47 46 -- 45 41 -- 40 -- */ \ | |
596 \ | |
597 movdqu_r2r(xmm1, xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ | |
598 pand_r2r(xmm5, xmm1); /* xmm1 = -- -- -- 45 41 -- -- -- */ \ | |
599 \ | |
600 pxor_r2r(xmm1, xmm5); /* xmm5 = 47 46 -- -- -- -- 40 -- */ \ | |
601 pslldq_i2r(4, xmm1); /* xmm1 = -- 45 41 -- -- -- -- -- */ \ | |
602 \ | |
603 pshufd_r2r(xmm5, xmm5, 0x09C); /* xmm5 = -- -- -- -- 47 46 40 -- */ \ | |
604 por_r2r(xmm1, xmm3); /* xmm3 = -- 45 41 27 22 14 07 05 */ \ | |
605 \ | |
606 movdqu_m2r(*(eax + 96), xmm1); /* xmm1 = 67 66 65 64 63 62 61 60 */ \ | |
607 pmullw_m2r(*(ebx + 96), xmm1); \ | |
608 \ | |
609 movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \ | |
610 \ | |
611 psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \ | |
612 pand_r2r(xmm5, xmm7); /* xmm7 = -- -- -- -- -- 46 40 -- */ \ | |
613 \ | |
614 pand_r2r(xmm1, xmm6); /* xmm6 = -- -- -- -- -- -- -- 60 */ \ | |
615 pxor_r2r(xmm7, xmm5); /* xmm5 = -- -- -- -- 47 -- -- -- */ \ | |
616 \ | |
617 pxor_r2r(xmm6, xmm1); /* xmm1 = 67 66 65 64 63 62 61 -- */ \ | |
618 pslldq_i2r(2, xmm5); /* xmm5 = -- -- -- 47 -- -- -- -- */ \ | |
619 \ | |
620 pslldq_i2r(14, xmm6); /* xmm6 = 60 -- -- -- -- -- -- -- */ \ | |
621 por_r2r(xmm5, xmm4); /* xmm4 = -- -- -- 47 -- -- 20 -- */ \ | |
622 \ | |
623 por_r2r(xmm6, xmm3); /* xmm3 = 60 45 41 27 22 14 07 05 */ \ | |
624 pslldq_i2r(6, xmm7); /* xmm7 = -- -- 46 40 -- -- -- -- */ \ | |
625 \ | |
626 movdqu_r2m(xmm3, *(eax+32)); /* write 60 45 41 27 22 14 07 05 */ \ | |
627 por_r2r(xmm7, xmm0); /* xmm0 = -- -- 46 40 -- 21 15 06 */ \ | |
628 /* 0, 1, 2, 4 in use */ \ | |
629 movdqu_m2r(*(eax + 48), xmm3); /* xmm3 = 37 36 35 34 33 32 31 30 */ \ | |
630 movdqu_m2r(*(eax + 80), xmm5); /* xmm5 = 57 56 55 54 53 52 51 50 */ \ | |
631 \ | |
632 pmullw_m2r(*(ebx + 48), xmm3); \ | |
633 pmullw_m2r(*(ebx + 80), xmm5); \ | |
634 \ | |
635 movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ | |
636 movdqu_m2r(*(ecx + 64), xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \ | |
637 \ | |
638 psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \ | |
639 pslldq_i2r(8, xmm7); /* xmm7 = FF -- -- -- -- -- -- -- */ \ | |
640 \ | |
641 pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- -- -- -- -- 30 */ \ | |
642 pand_r2r(xmm5, xmm7); /* xmm7 = 57 -- -- -- -- -- -- -- */ \ | |
643 \ | |
644 pxor_r2r(xmm6, xmm3); /* xmm3 = 37 36 35 34 33 32 31 -- */ \ | |
645 pxor_r2r(xmm7, xmm5); /* xmm5 = __ 56 55 54 53 52 51 50 */ \ | |
646 \ | |
647 pslldq_i2r(6, xmm6); /* xmm6 = -- -- -- -- 30 -- -- -- */ \ | |
648 psrldq_i2r(2, xmm7); /* xmm7 = -- 57 -- -- -- -- -- -- */ \ | |
649 \ | |
650 por_r2r(xmm7, xmm6); /* xmm6 = -- 57 -- -- 30 -- -- -- */ \ | |
651 movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \ | |
652 \ | |
653 por_r2r(xmm6, xmm0); /* xmm0 = -- 57 46 40 30 21 15 06 */ \ | |
654 psrldq_i2r(2, xmm7); /* xmm7 = -- -- -- -- -- -- FF FF */ \ | |
655 \ | |
656 movdqu_r2r(xmm2, xmm6); /* xmm6 = 17 16 -- -- -- -- -- -- */ \ | |
657 pand_r2r(xmm1, xmm7); /* xmm7 = -- -- -- -- -- -- 61 -- */ \ | |
658 \ | |
659 pslldq_i2r(2, xmm6); /* xmm6 = 16 -- -- -- -- -- -- -- */ \ | |
660 psrldq_i2r(14, xmm2); /* xmm2 = -- -- -- -- -- -- -- 17 */ \ | |
661 \ | |
662 pxor_r2r(xmm7, xmm1); /* xmm1 = 67 66 65 64 63 62 -- -- */ \ | |
663 pslldq_i2r(12, xmm7); /* xmm7 = 61 -- -- -- -- -- -- -- */ \ | |
664 \ | |
665 psrldq_i2r(14, xmm6); /* xmm6 = -- -- -- -- -- -- -- 16 */ \ | |
666 por_r2r(xmm6, xmm4); /* xmm4 = -- -- -- 47 -- -- 20 16 */ \ | |
667 \ | |
668 por_r2r(xmm7, xmm0); /* xmm0 = 61 57 46 40 30 21 15 06 */ \ | |
669 movdqu_m2r(*(ecx), xmm6); /* xmm6 = -- -- -- -- -- FF FF -- */ \ | |
670 \ | |
671 psrldq_i2r(2, xmm6); /* xmm6 = -- -- -- -- -- -- FF FF */ \ | |
672 movdqu_r2m(xmm0, *(eax+48)); /* write 61 57 46 40 30 21 15 06 */ \ | |
673 /* 1, 2, 3, 4, 5 in use */\ | |
674 movdqu_m2r(*(ecx), xmm0); /* xmm0 = -- -- -- -- -- FF FF -- */ \ | |
675 pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- -- -- -- 31 -- */ \ | |
676 \ | |
677 movdqu_r2r(xmm3, xmm7); /* xmm7 = 37 36 35 34 33 32 31 -- */ \ | |
678 pxor_r2r(xmm6, xmm3); /* xmm3 = 37 36 35 34 33 32 -- -- */ \ | |
679 \ | |
680 pslldq_i2r(2, xmm3); /* xmm3 = 36 35 34 33 32 -- -- -- */ \ | |
681 pand_r2r(xmm1, xmm0); /* xmm0 = -- -- -- -- -- 62 -- -- */ \ | |
682 \ | |
683 psrldq_i2r(14, xmm7); /* xmm7 = -- -- -- -- -- -- -- 37 */ \ | |
684 pxor_r2r(xmm0, xmm1); /* xmm1 = 67 66 65 64 63 -- -- -- */ \ | |
685 \ | |
686 por_r2r(xmm7, xmm6); /* xmm6 = -- -- -- -- -- -- 31 37 */ \ | |
687 movdqu_m2r(*(ecx + 64), xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \ | |
688 \ | |
689 pshuflw_r2r(xmm6, xmm6, 0x01E); /* xmm6 = -- -- -- -- 37 31 -- -- */ \ | |
690 pslldq_i2r(6, xmm7); /* xmm7 = FF FF -- -- -- -- -- -- */ \ | |
691 \ | |
692 por_r2r(xmm6, xmm4); /* xmm4 = -- -- -- 47 37 31 20 16 */ \ | |
693 pand_r2r(xmm5, xmm7); /* xmm7 = -- 56 -- -- -- -- -- -- */ \ | |
694 \ | |
695 pslldq_i2r(8, xmm0); /* xmm0 = -- 62 -- -- -- -- -- -- */ \ | |
696 pxor_r2r(xmm7, xmm5); /* xmm5 = -- -- 55 54 53 52 51 50 */ \ | |
697 \ | |
698 psrldq_i2r(2, xmm7); /* xmm7 = -- -- 56 -- -- -- -- -- */ \ | |
699 \ | |
700 pshufhw_r2r(xmm3, xmm3, 0x087); /* xmm3 = 35 33 34 36 32 -- -- -- */ \ | |
701 por_r2r(xmm7, xmm0); /* xmm0 = -- 62 56 -- -- -- -- -- */ \ | |
702 \ | |
703 movdqu_m2r(*(eax + 112), xmm7); /* xmm7 = 77 76 75 74 73 72 71 70 */ \ | |
704 pmullw_m2r(*(ebx + 112), xmm7); \ | |
705 \ | |
706 movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ | |
707 por_r2r(xmm0, xmm4); /* xmm4 = -- 62 56 47 37 31 20 16 */ \ | |
708 \ | |
709 pshuflw_r2r(xmm7, xmm7, 0x0E1); /* xmm7 = 77 76 75 74 73 72 70 71 */ \ | |
710 psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \ | |
711 \ | |
712 movdqu_m2r(*(ecx + 64), xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \ | |
713 pand_r2r(xmm7, xmm6); /* xmm6 = -- -- -- -- -- -- -- 71 */ \ | |
714 \ | |
715 pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- 36 32 -- -- -- */ \ | |
716 pxor_r2r(xmm6, xmm7); /* xmm7 = 77 76 75 74 73 72 70 -- */ \ | |
717 \ | |
718 pxor_r2r(xmm0, xmm3); /* xmm3 = 35 33 34 -- -- -- -- -- */ \ | |
719 pslldq_i2r(14, xmm6); /* xmm6 = 71 -- -- -- -- -- -- -- */ \ | |
720 \ | |
721 psrldq_i2r(4, xmm0); /* xmm0 = -- -- -- -- -- 36 32 -- */ \ | |
722 por_r2r(xmm6, xmm4); /* xmm4 = 71 62 56 47 37 31 20 16 */ \ | |
723 \ | |
724 por_r2r(xmm0, xmm2); /* xmm2 = -- -- -- -- -- 36 32 17 */ \ | |
725 movdqu_r2m(xmm4, *(eax + 64)); /* write 71 62 56 47 37 31 20 16 */ \ | |
726 /* 1, 2, 3, 5, 7 in use */ \ | |
727 movdqu_m2r(*(ecx + 80), xmm6); /* xmm6 = -- -- FF -- -- -- -- FF */ \ | |
728 pshufhw_r2r(xmm7, xmm7, 0x0D2); /* xmm7 = 77 75 74 76 73 72 70 __ */ \ | |
729 \ | |
730 movdqu_m2r(*(ecx), xmm4); /* xmm4 = -- -- -- -- -- FF FF -- */ \ | |
731 movdqu_m2r(*(ecx+48), xmm0); /* xmm0 = -- -- -- -- FF -- -- -- */ \ | |
732 \ | |
733 pand_r2r(xmm5, xmm6); /* xmm6 = -- -- 55 -- -- -- -- 50 */ \ | |
734 pand_r2r(xmm7, xmm4); /* xmm4 = -- -- -- -- -- 72 70 -- */ \ | |
735 \ | |
736 pand_r2r(xmm1, xmm0); /* xmm0 = -- -- -- -- 63 -- -- -- */ \ | |
737 pxor_r2r(xmm6, xmm5); /* xmm5 = -- -- -- 54 53 52 51 -- */ \ | |
738 \ | |
739 pxor_r2r(xmm4, xmm7); /* xmm7 = 77 75 74 76 73 -- -- -- */ \ | |
740 pxor_r2r(xmm0, xmm1); /* xmm1 = 67 66 65 64 -- -- -- -- */ \ | |
741 \ | |
742 pshuflw_r2r(xmm6, xmm6, 0x02B); /* xmm6 = -- -- 55 -- 50 -- -- -- */ \ | |
743 pslldq_i2r(10, xmm4); /* xmm4 = 72 70 -- -- -- -- -- -- */ \ | |
744 \ | |
745 pshufhw_r2r(xmm6, xmm6, 0x0B1); /* xmm6 = -- -- -- 55 50 -- -- -- */ \ | |
746 pslldq_i2r(4, xmm0); /* xmm0 = -- -- 63 -- -- -- -- -- */ \ | |
747 \ | |
748 por_r2r(xmm4, xmm6); /* xmm6 = 72 70 -- 55 50 -- -- -- */ \ | |
749 por_r2r(xmm0, xmm2); /* xmm2 = -- -- 63 -- -- 36 32 17 */ \ | |
750 \ | |
751 por_r2r(xmm6, xmm2); /* xmm2 = 72 70 63 55 50 36 32 17 */ \ | |
752 pshufhw_r2r(xmm1, xmm1, 0x0C9); /* xmm1 = 67 64 66 65 -- -- -- -- */ \ | |
753 \ | |
754 movdqu_r2r(xmm3, xmm6); /* xmm6 = 35 33 34 -- -- -- -- -- */ \ | |
755 movdqu_r2m(xmm2, *(eax+80)); /* write 72 70 63 55 50 36 32 17 */ \ | |
756 \ | |
757 psrldq_i2r(12, xmm6); /* xmm6 = -- -- -- -- -- -- 35 33 */ \ | |
758 pslldq_i2r(4, xmm3); /* xmm3 = 34 -- -- -- -- -- -- -- */ \ | |
759 \ | |
760 pshuflw_r2r(xmm5, xmm5, 0x04E); /* xmm5 = -- -- -- 54 51 -- 53 52 */ \ | |
761 movdqu_r2r(xmm7, xmm4); /* xmm4 = 77 75 74 76 73 -- -- -- */ \ | |
762 \ | |
763 movdqu_r2r(xmm5, xmm2); /* xmm2 = -- -- -- 54 51 -- 53 52 */ \ | |
764 psrldq_i2r(10, xmm7); /* xmm7 = -- -- -- -- -- 77 75 74 */ \ | |
765 \ | |
766 pslldq_i2r(6, xmm4); /* xmm4 = 76 73 -- -- -- -- -- -- */ \ | |
767 pslldq_i2r(12, xmm2); /* xmm2 = 53 52 -- -- -- -- -- -- */ \ | |
768 \ | |
769 movdqu_r2r(xmm1, xmm0); /* xmm0 = 67 64 66 65 -- -- -- -- */ \ | |
770 psrldq_i2r(12, xmm1); /* xmm1 = -- -- -- -- -- -- 67 64 */ \ | |
771 \ | |
772 psrldq_i2r(6, xmm5); /* xmm5 = -- -- -- -- -- -- 54 51 */ \ | |
773 psrldq_i2r(14, xmm3); /* xmm3 = -- -- -- -- -- -- -- 34 */ \ | |
774 \ | |
775 pslldq_i2r(10, xmm7); /* xmm7 = 77 75 74 -- -- -- -- -- */ \ | |
776 por_r2r(xmm6, xmm4); /* xmm4 = 76 73 -- -- -- -- 35 33 */ \ | |
777 \ | |
778 psrldq_i2r(10, xmm2); /* xmm2 = -- -- -- -- -- 53 52 -- */ \ | |
779 pslldq_i2r(4, xmm0); /* xmm0 = 66 65 -- -- -- -- -- -- */ \ | |
780 \ | |
781 pslldq_i2r(8, xmm1); /* xmm1 = -- -- 67 64 -- -- -- -- */ \ | |
782 por_r2r(xmm7, xmm3); /* xmm3 = 77 75 74 -- -- -- -- 34 */ \ | |
783 \ | |
784 psrldq_i2r(6, xmm0); /* xmm0 = -- -- -- 66 65 -- -- -- */ \ | |
785 pslldq_i2r(4, xmm5); /* xmm5 = -- -- -- -- 54 51 -- -- */ \ | |
786 \ | |
787 por_r2r(xmm1, xmm4); /* xmm4 = 76 73 67 64 -- -- 35 33 */ \ | |
788 por_r2r(xmm2, xmm3); /* xmm3 = 77 75 74 -- -- 53 52 34 */ \ | |
789 \ | |
790 por_r2r(xmm5, xmm4); /* xmm4 = 76 73 67 64 54 51 35 33 */ \ | |
791 por_r2r(xmm0, xmm3); /* xmm3 = 77 75 74 66 65 53 52 34 */ \ | |
792 \ | |
793 movdqu_r2m(xmm4, *(eax+96)); /* write 76 73 67 64 54 51 35 33 */ \ | |
794 movdqu_r2m(xmm3, *(eax+112)); /* write 77 75 74 66 65 53 52 34 */ \ | |
795 \ | |
796 } /* end of SSE2_Dequantize Macro */ | |
797 | |
798 | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
1978
diff
changeset
|
799 void ff_vp3_idct_sse2(int16_t *input_data) |
1970 | 800 { |
801 unsigned char *input_bytes = (unsigned char *)input_data; | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
1978
diff
changeset
|
802 unsigned char *output_data_bytes = (unsigned char *)input_data; |
1970 | 803 unsigned char *idct_data_bytes = (unsigned char *)SSE2_idct_data; |
804 unsigned char *Eight = (unsigned char *)eight_data; | |
805 | |
806 #define eax input_bytes | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
1978
diff
changeset
|
807 //#define ebx dequant_matrix_bytes |
1970 | 808 #define ecx dequant_const_bytes |
809 #define edx idct_data_bytes | |
810 | |
811 #define I(i) (eax + 16 * i) | |
812 #define O(i) (ebx + 16 * i) | |
813 #define C(i) (edx + 16 * (i-1)) | |
814 | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
1978
diff
changeset
|
815 // SSE2_Dequantize(); |
1970 | 816 |
817 #undef ebx | |
818 #define ebx output_data_bytes | |
819 | |
820 SSE2_Row_IDCT(); | |
821 | |
822 SSE2_Transpose(); | |
823 | |
824 SSE2_Column_IDCT(); | |
825 } |