Mercurial > libavcodec.hg
annotate i386/vp3dsp_sse2.c @ 2892:41315d0120b3 libavcodec
replace a few mov + psrlq with pshufw, there are more cases which could benefit from this but they would require us to duplicate some functions ...
the trick is from various places (my own code in libpostproc, a patch on the x264 list, ...)
author | michael |
---|---|
date | Wed, 21 Sep 2005 21:17:09 +0000 |
parents | fd5d7c732c6b |
children | ef2149182f1c |
rev | line source |
---|---|
1970 | 1 /* |
2 * Copyright (C) 2004 the ffmpeg project | |
3 * | |
4 * This library is free software; you can redistribute it and/or | |
5 * modify it under the terms of the GNU Lesser General Public | |
6 * License as published by the Free Software Foundation; either | |
7 * version 2 of the License, or (at your option) any later version. | |
8 * | |
9 * This library is distributed in the hope that it will be useful, | |
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
12 * Lesser General Public License for more details. | |
13 * | |
14 * You should have received a copy of the GNU Lesser General Public | |
15 * License along with this library; if not, write to the Free Software | |
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
17 */ | |
18 | |
19 /** | |
20 * @file vp3dsp_sse2.c | |
21 * SSE2-optimized functions cribbed from the original VP3 source code. | |
22 */ | |
23 | |
24 #include "../dsputil.h" | |
25 #include "mmx.h" | |
26 | |
2753 | 27 static const unsigned short __align16 SSE2_dequant_const[] = |
1970 | 28 { |
29 0,65535,65535,0,0,0,0,0, // 0x0000 0000 0000 0000 0000 FFFF FFFF 0000 | |
30 0,0,0,0,65535,65535,0,0, // 0x0000 0000 FFFF FFFF 0000 0000 0000 0000 | |
31 65535,65535,65535,0,0,0,0,0,// 0x0000 0000 0000 0000 0000 FFFF FFFF FFFF | |
32 0,0,0,65535,0,0,0,0, // 0x0000 0000 0000 0000 FFFF 0000 0000 0000 | |
33 0,0,0,65535,65535,0,0,0, // 0x0000 0000 0000 FFFF FFFF 0000 0000 0000 | |
34 65535,0,0,0,0,65535,0,0, // 0x0000 0000 FFFF 0000 0000 0000 0000 FFFF | |
35 0,0,65535,65535, 0,0,0,0 // 0x0000 0000 0000 0000 FFFF FFFF 0000 0000 | |
36 }; | |
37 | |
2753 | 38 static const unsigned int __align16 eight_data[] = |
1970 | 39 { |
40 0x00080008, | |
41 0x00080008, | |
42 0x00080008, | |
43 0x00080008 | |
44 }; | |
45 | |
2753 | 46 static const unsigned short __align16 SSE2_idct_data[7 * 8] = |
1970 | 47 { |
48 64277,64277,64277,64277,64277,64277,64277,64277, | |
49 60547,60547,60547,60547,60547,60547,60547,60547, | |
50 54491,54491,54491,54491,54491,54491,54491,54491, | |
51 46341,46341,46341,46341,46341,46341,46341,46341, | |
52 36410,36410,36410,36410,36410,36410,36410,36410, | |
53 25080,25080,25080,25080,25080,25080,25080,25080, | |
54 12785,12785,12785,12785,12785,12785,12785,12785 | |
55 }; | |
56 | |
57 | |
58 #define SSE2_Column_IDCT() { \ | |
59 \ | |
60 movdqu_m2r(*I(3), xmm2); /* xmm2 = i3 */ \ | |
61 movdqu_m2r(*C(3), xmm6); /* xmm6 = c3 */ \ | |
62 \ | |
63 movdqu_r2r(xmm2, xmm4); /* xmm4 = i3 */ \ | |
64 movdqu_m2r(*I(5), xmm7); /* xmm7 = i5 */ \ | |
65 \ | |
66 pmulhw_r2r(xmm6, xmm4); /* xmm4 = c3 * i3 - i3 */ \ | |
67 movdqu_m2r(*C(5), xmm1); /* xmm1 = c5 */ \ | |
68 \ | |
69 pmulhw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 - i5 */ \ | |
70 movdqu_r2r(xmm1, xmm5); /* xmm5 = c5 */ \ | |
71 \ | |
72 pmulhw_r2r(xmm2, xmm1); /* xmm1 = c5 * i3 - i3 */ \ | |
73 movdqu_m2r(*I(1), xmm3); /* xmm3 = i1 */ \ | |
74 \ | |
75 pmulhw_r2r(xmm7, xmm5); /* xmm5 = c5 * i5 - i5 */ \ | |
76 movdqu_m2r(*C(1), xmm0); /* xmm0 = c1 */ \ | |
77 \ | |
78 /* all registers are in use */ \ | |
79 \ | |
80 paddw_r2r(xmm2, xmm4); /* xmm4 = c3 * i3 */ \ | |
81 paddw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 */ \ | |
82 \ | |
83 paddw_r2r(xmm1, xmm2); /* xmm2 = c5 * i3 */ \ | |
84 movdqu_m2r(*I(7), xmm1); /* xmm1 = i7 */ \ | |
85 \ | |
86 paddw_r2r(xmm5, xmm7); /* xmm7 = c5 * i5 */ \ | |
87 movdqu_r2r(xmm0, xmm5); /* xmm5 = c1 */ \ | |
88 \ | |
89 pmulhw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 - i1 */ \ | |
90 paddsw_r2r(xmm7, xmm4); /* xmm4 = c3 * i3 + c5 * i5 = C */ \ | |
91 \ | |
92 pmulhw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 - i7 */ \ | |
93 movdqu_m2r(*C(7), xmm7); /* xmm7 = c7 */ \ | |
94 \ | |
95 psubsw_r2r(xmm2, xmm6); /* xmm6 = c3 * i5 - c5 * i3 = D */ \ | |
96 paddw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 */ \ | |
97 \ | |
98 pmulhw_r2r(xmm7, xmm3); /* xmm3 = c7 * i1 */ \ | |
99 movdqu_m2r(*I(2), xmm2); /* xmm2 = i2 */ \ | |
100 \ | |
101 pmulhw_r2r(xmm1, xmm7); /* xmm7 = c7 * i7 */ \ | |
102 paddw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 */ \ | |
103 \ | |
104 movdqu_r2r(xmm2, xmm1); /* xmm1 = i2 */ \ | |
105 pmulhw_m2r(*C(2), xmm2); /* xmm2 = i2 * c2 -i2 */ \ | |
106 \ | |
107 psubsw_r2r(xmm5, xmm3); /* xmm3 = c7 * i1 - c1 * i7 = B */ \ | |
108 movdqu_m2r(*I(6), xmm5); /* xmm5 = i6 */ \ | |
109 \ | |
110 paddsw_r2r(xmm7, xmm0); /* xmm0 = c1 * i1 + c7 * i7 = A */ \ | |
111 movdqu_r2r(xmm5, xmm7); /* xmm7 = i6 */ \ | |
112 \ | |
113 psubsw_r2r(xmm4, xmm0); /* xmm0 = A - C */ \ | |
114 pmulhw_m2r(*C(2), xmm5); /* xmm5 = c2 * i6 - i6 */ \ | |
115 \ | |
116 paddw_r2r(xmm1, xmm2); /* xmm2 = i2 * c2 */ \ | |
117 pmulhw_m2r(*C(6), xmm1); /* xmm1 = c6 * i2 */ \ | |
118 \ | |
119 paddsw_r2r(xmm4, xmm4); /* xmm4 = C + C */ \ | |
120 paddsw_r2r(xmm0, xmm4); /* xmm4 = A + C = C. */ \ | |
121 \ | |
122 psubsw_r2r(xmm6, xmm3); /* xmm3 = B - D */ \ | |
123 paddw_r2r(xmm7, xmm5); /* xmm5 = c2 * i6 */ \ | |
124 \ | |
125 paddsw_r2r(xmm6, xmm6); /* xmm6 = D + D */ \ | |
126 pmulhw_m2r(*C(6), xmm7); /* xmm7 = c6 * i6 */ \ | |
127 \ | |
128 paddsw_r2r(xmm3, xmm6); /* xmm6 = B + D = D. */ \ | |
129 movdqu_r2m(xmm4, *I(1)); /* Save C. at I(1) */ \ | |
130 \ | |
131 psubsw_r2r(xmm5, xmm1); /* xmm1 = c6 * i2 - c2 * i6 = H */ \ | |
132 movdqu_m2r(*C(4), xmm4); /* xmm4 = c4 */ \ | |
133 \ | |
134 movdqu_r2r(xmm3, xmm5); /* xmm5 = B - D */ \ | |
135 pmulhw_r2r(xmm4, xmm3); /* xmm3 = ( c4 -1 ) * ( B - D ) */ \ | |
136 \ | |
137 paddsw_r2r(xmm2, xmm7); /* xmm7 = c2 * i2 + c6 * i6 = G */ \ | |
138 movdqu_r2m(xmm6, *I(2)); /* Save D. at I(2) */ \ | |
139 \ | |
140 movdqu_r2r(xmm0, xmm2); /* xmm2 = A - C */ \ | |
141 movdqu_m2r(*I(0), xmm6); /* xmm6 = i0 */ \ | |
142 \ | |
143 pmulhw_r2r(xmm4, xmm0); /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \ | |
144 paddw_r2r(xmm3, xmm5); /* xmm5 = c4 * ( B - D ) = B. */ \ | |
145 \ | |
146 movdqu_m2r(*I(4), xmm3); /* xmm3 = i4 */ \ | |
147 psubsw_r2r(xmm1, xmm5); /* xmm5 = B. - H = B.. */ \ | |
148 \ | |
149 paddw_r2r(xmm0, xmm2); /* xmm2 = c4 * ( A - C) = A. */ \ | |
150 psubsw_r2r(xmm3, xmm6); /* xmm6 = i0 - i4 */ \ | |
151 \ | |
152 movdqu_r2r(xmm6, xmm0); /* xmm0 = i0 - i4 */ \ | |
153 pmulhw_r2r(xmm4, xmm6); /* xmm6 = (c4 - 1) * (i0 - i4) = F */ \ | |
154 \ | |
155 paddsw_r2r(xmm3, xmm3); /* xmm3 = i4 + i4 */ \ | |
156 paddsw_r2r(xmm1, xmm1); /* xmm1 = H + H */ \ | |
157 \ | |
158 paddsw_r2r(xmm0, xmm3); /* xmm3 = i0 + i4 */ \ | |
159 paddsw_r2r(xmm5, xmm1); /* xmm1 = B. + H = H. */ \ | |
160 \ | |
161 pmulhw_r2r(xmm3, xmm4); /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \ | |
162 paddw_r2r(xmm0, xmm6); /* xmm6 = c4 * ( i0 - i4 ) */ \ | |
163 \ | |
164 psubsw_r2r(xmm2, xmm6); /* xmm6 = F - A. = F. */ \ | |
165 paddsw_r2r(xmm2, xmm2); /* xmm2 = A. + A. */ \ | |
166 \ | |
167 movdqu_m2r(*I(1), xmm0); /* Load C. from I(1) */ \ | |
168 paddsw_r2r(xmm6, xmm2); /* xmm2 = F + A. = A.. */ \ | |
169 \ | |
170 paddw_r2r(xmm3, xmm4); /* xmm4 = c4 * ( i0 + i4 ) = E */ \ | |
171 psubsw_r2r(xmm1, xmm2); /* xmm2 = A.. - H. = R2 */ \ | |
172 \ | |
173 paddsw_m2r(*Eight, xmm2); /* Adjust R2 and R1 before shifting */ \ | |
174 paddsw_r2r(xmm1, xmm1); /* xmm1 = H. + H. */ \ | |
175 \ | |
176 paddsw_r2r(xmm2, xmm1); /* xmm1 = A.. + H. = R1 */ \ | |
177 psraw_i2r(4, xmm2); /* xmm2 = op2 */ \ | |
178 \ | |
179 psubsw_r2r(xmm7, xmm4); /* xmm4 = E - G = E. */ \ | |
180 psraw_i2r(4, xmm1); /* xmm1 = op1 */ \ | |
181 \ | |
182 movdqu_m2r(*I(2), xmm3); /* Load D. from I(2) */ \ | |
183 paddsw_r2r(xmm7, xmm7); /* xmm7 = G + G */ \ | |
184 \ | |
185 movdqu_r2m(xmm2, *O(2)); /* Write out op2 */ \ | |
186 paddsw_r2r(xmm4, xmm7); /* xmm7 = E + G = G. */ \ | |
187 \ | |
188 movdqu_r2m(xmm1, *O(1)); /* Write out op1 */ \ | |
189 psubsw_r2r(xmm3, xmm4); /* xmm4 = E. - D. = R4 */ \ | |
190 \ | |
191 paddsw_m2r(*Eight, xmm4); /* Adjust R4 and R3 before shifting */ \ | |
192 paddsw_r2r(xmm3, xmm3); /* xmm3 = D. + D. */ \ | |
193 \ | |
194 paddsw_r2r(xmm4, xmm3); /* xmm3 = E. + D. = R3 */ \ | |
195 psraw_i2r(4, xmm4); /* xmm4 = op4 */ \ | |
196 \ | |
197 psubsw_r2r(xmm5, xmm6); /* xmm6 = F. - B..= R6 */ \ | |
198 psraw_i2r(4, xmm3); /* xmm3 = op3 */ \ | |
199 \ | |
200 paddsw_m2r(*Eight, xmm6); /* Adjust R6 and R5 before shifting */ \ | |
201 paddsw_r2r(xmm5, xmm5); /* xmm5 = B.. + B.. */ \ | |
202 \ | |
203 paddsw_r2r(xmm6, xmm5); /* xmm5 = F. + B.. = R5 */ \ | |
204 psraw_i2r(4, xmm6); /* xmm6 = op6 */ \ | |
205 \ | |
206 movdqu_r2m(xmm4, *O(4)); /* Write out op4 */ \ | |
207 psraw_i2r(4, xmm5); /* xmm5 = op5 */ \ | |
208 \ | |
209 movdqu_r2m(xmm3, *O(3)); /* Write out op3 */ \ | |
210 psubsw_r2r(xmm0, xmm7); /* xmm7 = G. - C. = R7 */ \ | |
211 \ | |
212 paddsw_m2r(*Eight, xmm7); /* Adjust R7 and R0 before shifting */ \ | |
213 paddsw_r2r(xmm0, xmm0); /* xmm0 = C. + C. */ \ | |
214 \ | |
215 paddsw_r2r(xmm7, xmm0); /* xmm0 = G. + C. */ \ | |
216 psraw_i2r(4, xmm7); /* xmm7 = op7 */ \ | |
217 \ | |
218 movdqu_r2m(xmm6, *O(6)); /* Write out op6 */ \ | |
219 psraw_i2r(4, xmm0); /* xmm0 = op0 */ \ | |
220 \ | |
221 movdqu_r2m(xmm5, *O(5)); /* Write out op5 */ \ | |
222 movdqu_r2m(xmm7, *O(7)); /* Write out op7 */ \ | |
223 \ | |
224 movdqu_r2m(xmm0, *O(0)); /* Write out op0 */ \ | |
225 \ | |
226 } /* End of SSE2_Column_IDCT macro */ | |
227 | |
228 | |
229 #define SSE2_Row_IDCT() { \ | |
230 \ | |
231 movdqu_m2r(*I(3), xmm2); /* xmm2 = i3 */ \ | |
232 movdqu_m2r(*C(3), xmm6); /* xmm6 = c3 */ \ | |
233 \ | |
234 movdqu_r2r(xmm2, xmm4); /* xmm4 = i3 */ \ | |
235 movdqu_m2r(*I(5), xmm7); /* xmm7 = i5 */ \ | |
236 \ | |
237 pmulhw_r2r(xmm6, xmm4); /* xmm4 = c3 * i3 - i3 */ \ | |
238 movdqu_m2r(*C(5), xmm1); /* xmm1 = c5 */ \ | |
239 \ | |
240 pmulhw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 - i5 */ \ | |
241 movdqu_r2r(xmm1, xmm5); /* xmm5 = c5 */ \ | |
242 \ | |
243 pmulhw_r2r(xmm2, xmm1); /* xmm1 = c5 * i3 - i3 */ \ | |
244 movdqu_m2r(*I(1), xmm3); /* xmm3 = i1 */ \ | |
245 \ | |
246 pmulhw_r2r(xmm7, xmm5); /* xmm5 = c5 * i5 - i5 */ \ | |
247 movdqu_m2r(*C(1), xmm0); /* xmm0 = c1 */ \ | |
248 \ | |
249 /* all registers are in use */ \ | |
250 \ | |
251 paddw_r2r(xmm2, xmm4); /* xmm4 = c3 * i3 */ \ | |
252 paddw_r2r(xmm7, xmm6); /* xmm6 = c3 * i5 */ \ | |
253 \ | |
254 paddw_r2r(xmm1, xmm2); /* xmm2 = c5 * i3 */ \ | |
255 movdqu_m2r(*I(7), xmm1); /* xmm1 = i7 */ \ | |
256 \ | |
257 paddw_r2r(xmm5, xmm7); /* xmm7 = c5 * i5 */ \ | |
258 movdqu_r2r(xmm0, xmm5); /* xmm5 = c1 */ \ | |
259 \ | |
260 pmulhw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 - i1 */ \ | |
261 paddsw_r2r(xmm7, xmm4); /* xmm4 = c3 * i3 + c5 * i5 = C */ \ | |
262 \ | |
263 pmulhw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 - i7 */ \ | |
264 movdqu_m2r(*C(7), xmm7); /* xmm7 = c7 */ \ | |
265 \ | |
266 psubsw_r2r(xmm2, xmm6); /* xmm6 = c3 * i5 - c5 * i3 = D */ \ | |
267 paddw_r2r(xmm3, xmm0); /* xmm0 = c1 * i1 */ \ | |
268 \ | |
269 pmulhw_r2r(xmm7, xmm3); /* xmm3 = c7 * i1 */ \ | |
270 movdqu_m2r(*I(2), xmm2); /* xmm2 = i2 */ \ | |
271 \ | |
272 pmulhw_r2r(xmm1, xmm7); /* xmm7 = c7 * i7 */ \ | |
273 paddw_r2r(xmm1, xmm5); /* xmm5 = c1 * i7 */ \ | |
274 \ | |
275 movdqu_r2r(xmm2, xmm1); /* xmm1 = i2 */ \ | |
276 pmulhw_m2r(*C(2), xmm2); /* xmm2 = i2 * c2 -i2 */ \ | |
277 \ | |
278 psubsw_r2r(xmm5, xmm3); /* xmm3 = c7 * i1 - c1 * i7 = B */ \ | |
279 movdqu_m2r(*I(6), xmm5); /* xmm5 = i6 */ \ | |
280 \ | |
281 paddsw_r2r(xmm7, xmm0); /* xmm0 = c1 * i1 + c7 * i7 = A */ \ | |
282 movdqu_r2r(xmm5, xmm7); /* xmm7 = i6 */ \ | |
283 \ | |
284 psubsw_r2r(xmm4, xmm0); /* xmm0 = A - C */ \ | |
285 pmulhw_m2r(*C(2), xmm5); /* xmm5 = c2 * i6 - i6 */ \ | |
286 \ | |
287 paddw_r2r(xmm1, xmm2); /* xmm2 = i2 * c2 */ \ | |
288 pmulhw_m2r(*C(6), xmm1); /* xmm1 = c6 * i2 */ \ | |
289 \ | |
290 paddsw_r2r(xmm4, xmm4); /* xmm4 = C + C */ \ | |
291 paddsw_r2r(xmm0, xmm4); /* xmm4 = A + C = C. */ \ | |
292 \ | |
293 psubsw_r2r(xmm6, xmm3); /* xmm3 = B - D */ \ | |
294 paddw_r2r(xmm7, xmm5); /* xmm5 = c2 * i6 */ \ | |
295 \ | |
296 paddsw_r2r(xmm6, xmm6); /* xmm6 = D + D */ \ | |
297 pmulhw_m2r(*C(6), xmm7); /* xmm7 = c6 * i6 */ \ | |
298 \ | |
299 paddsw_r2r(xmm3, xmm6); /* xmm6 = B + D = D. */ \ | |
300 movdqu_r2m(xmm4, *I(1)); /* Save C. at I(1) */ \ | |
301 \ | |
302 psubsw_r2r(xmm5, xmm1); /* xmm1 = c6 * i2 - c2 * i6 = H */ \ | |
303 movdqu_m2r(*C(4), xmm4); /* xmm4 = c4 */ \ | |
304 \ | |
305 movdqu_r2r(xmm3, xmm5); /* xmm5 = B - D */ \ | |
306 pmulhw_r2r(xmm4, xmm3); /* xmm3 = ( c4 -1 ) * ( B - D ) */ \ | |
307 \ | |
308 paddsw_r2r(xmm2, xmm7); /* xmm7 = c2 * i2 + c6 * i6 = G */ \ | |
309 movdqu_r2m(xmm6, *I(2)); /* Save D. at I(2) */ \ | |
310 \ | |
311 movdqu_r2r(xmm0, xmm2); /* xmm2 = A - C */ \ | |
312 movdqu_m2r(*I(0), xmm6); /* xmm6 = i0 */ \ | |
313 \ | |
314 pmulhw_r2r(xmm4, xmm0); /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */ \ | |
315 paddw_r2r(xmm3, xmm5); /* xmm5 = c4 * ( B - D ) = B. */ \ | |
316 \ | |
317 movdqu_m2r(*I(4), xmm3); /* xmm3 = i4 */ \ | |
318 psubsw_r2r(xmm1, xmm5); /* xmm5 = B. - H = B.. */ \ | |
319 \ | |
320 paddw_r2r(xmm0, xmm2); /* xmm2 = c4 * ( A - C) = A. */ \ | |
321 psubsw_r2r(xmm3, xmm6); /* xmm6 = i0 - i4 */ \ | |
322 \ | |
323 movdqu_r2r(xmm6, xmm0); /* xmm0 = i0 - i4 */ \ | |
324 pmulhw_r2r(xmm4, xmm6); /* xmm6 = ( c4 - 1 ) * ( i0 - i4 ) = F */ \ | |
325 \ | |
326 paddsw_r2r(xmm3, xmm3); /* xmm3 = i4 + i4 */ \ | |
327 paddsw_r2r(xmm1, xmm1); /* xmm1 = H + H */ \ | |
328 \ | |
329 paddsw_r2r(xmm0, xmm3); /* xmm3 = i0 + i4 */ \ | |
330 paddsw_r2r(xmm5, xmm1); /* xmm1 = B. + H = H. */ \ | |
331 \ | |
332 pmulhw_r2r(xmm3, xmm4); /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */ \ | |
333 paddw_r2r(xmm0, xmm6); /* xmm6 = c4 * ( i0 - i4 ) */ \ | |
334 \ | |
335 psubsw_r2r(xmm2, xmm6); /* xmm6 = F - A. = F. */ \ | |
336 paddsw_r2r(xmm2, xmm2); /* xmm2 = A. + A. */ \ | |
337 \ | |
338 movdqu_m2r(*I(1), xmm0); /* Load C. from I(1) */ \ | |
339 paddsw_r2r(xmm6, xmm2); /* xmm2 = F + A. = A.. */ \ | |
340 \ | |
341 paddw_r2r(xmm3, xmm4); /* xmm4 = c4 * ( i0 + i4 ) = E */ \ | |
342 psubsw_r2r(xmm1, xmm2); /* xmm2 = A.. - H. = R2 */ \ | |
343 \ | |
344 paddsw_r2r(xmm1, xmm1); /* xmm1 = H. + H. */ \ | |
345 paddsw_r2r(xmm2, xmm1); /* xmm1 = A.. + H. = R1 */ \ | |
346 \ | |
347 psubsw_r2r(xmm7, xmm4); /* xmm4 = E - G = E. */ \ | |
348 \ | |
349 movdqu_m2r(*I(2), xmm3); /* Load D. from I(2) */ \ | |
350 paddsw_r2r(xmm7, xmm7); /* xmm7 = G + G */ \ | |
351 \ | |
352 movdqu_r2m(xmm2, *I(2)); /* Write out op2 */ \ | |
353 paddsw_r2r(xmm4, xmm7); /* xmm7 = E + G = G. */ \ | |
354 \ | |
355 movdqu_r2m(xmm1, *I(1)); /* Write out op1 */ \ | |
356 psubsw_r2r(xmm3, xmm4); /* xmm4 = E. - D. = R4 */ \ | |
357 \ | |
358 paddsw_r2r(xmm3, xmm3); /* xmm3 = D. + D. */ \ | |
359 \ | |
360 paddsw_r2r(xmm4, xmm3); /* xmm3 = E. + D. = R3 */ \ | |
361 \ | |
362 psubsw_r2r(xmm5, xmm6); /* xmm6 = F. - B..= R6 */ \ | |
363 \ | |
364 paddsw_r2r(xmm5, xmm5); /* xmm5 = B.. + B.. */ \ | |
365 \ | |
366 paddsw_r2r(xmm6, xmm5); /* xmm5 = F. + B.. = R5 */ \ | |
367 \ | |
368 movdqu_r2m(xmm4, *I(4)); /* Write out op4 */ \ | |
369 \ | |
370 movdqu_r2m(xmm3, *I(3)); /* Write out op3 */ \ | |
371 psubsw_r2r(xmm0, xmm7); /* xmm7 = G. - C. = R7 */ \ | |
372 \ | |
373 paddsw_r2r(xmm0, xmm0); /* xmm0 = C. + C. */ \ | |
374 \ | |
375 paddsw_r2r(xmm7, xmm0); /* xmm0 = G. + C. */ \ | |
376 \ | |
377 movdqu_r2m(xmm6, *I(6)); /* Write out op6 */ \ | |
378 \ | |
379 movdqu_r2m(xmm5, *I(5)); /* Write out op5 */ \ | |
380 movdqu_r2m(xmm7, *I(7)); /* Write out op7 */ \ | |
381 \ | |
382 movdqu_r2m(xmm0, *I(0)); /* Write out op0 */ \ | |
383 \ | |
384 } /* End of SSE2_Row_IDCT macro */ | |
385 | |
386 | |
387 #define SSE2_Transpose() { \ | |
388 \ | |
389 movdqu_m2r(*I(4), xmm4); /* xmm4=e7e6e5e4e3e2e1e0 */ \ | |
movdqu_m2r(*I(5), xmm0); /* xmm0=f7f6f5f4f3f2f1f0 */ \ | |
391 \ | |
392 movdqu_r2r(xmm4, xmm5); /* make a copy */ \ | |
393 punpcklwd_r2r(xmm0, xmm4); /* xmm4=f3e3f2e2f1e1f0e0 */ \ | |
394 \ | |
395 punpckhwd_r2r(xmm0, xmm5); /* xmm5=f7e7f6e6f5e5f4e4 */ \ | |
396 movdqu_m2r(*I(6), xmm6); /* xmm6=g7g6g5g4g3g2g1g0 */ \ | |
397 \ | |
398 movdqu_m2r(*I(7), xmm0); /* xmm0=h7h6h5h4h3h2h1h0 */ \ | |
399 movdqu_r2r(xmm6, xmm7); /* make a copy */ \ | |
400 \ | |
401 punpcklwd_r2r(xmm0, xmm6); /* xmm6=h3g3h2g2h1g1h0g0 */ \ | |
402 punpckhwd_r2r(xmm0, xmm7); /* xmm7=h7g7h6g6h5g5h4g4 */ \ | |
403 \ | |
404 movdqu_r2r(xmm4, xmm3); /* make a copy */ \ | |
405 punpckldq_r2r(xmm6, xmm4); /* xmm4=h1g1f1e1h0g0f0e0 */ \ | |
406 \ | |
407 punpckhdq_r2r(xmm6, xmm3); /* xmm3=h3g3f3e3h2g2f2e2 */ \ | |
408 movdqu_r2m(xmm3, *I(6)); /* save h3g3f3e3h2g2f2e2 */ \ | |
409 /* Free xmm6 */ \ | |
410 movdqu_r2r(xmm5, xmm6); /* make a copy */ \ | |
411 punpckldq_r2r(xmm7, xmm5); /* xmm5=h5g5f5e5h4g4f4e4 */ \ | |
412 \ | |
413 punpckhdq_r2r(xmm7, xmm6); /* xmm6=h7g7f7e7h6g6f6e6 */ \ | |
414 movdqu_m2r(*I(0), xmm0); /* xmm0=a7a6a5a4a3a2a1a0 */ \ | |
415 /* Free xmm7 */ \ | |
416 movdqu_m2r(*I(1), xmm1); /* xmm1=b7b6b5b4b3b2b1b0 */ \ | |
417 movdqu_r2r(xmm0, xmm7); /* make a copy */ \ | |
418 \ | |
419 punpcklwd_r2r(xmm1, xmm0); /* xmm0=b3a3b2a2b1a1b0a0 */ \ | |
420 punpckhwd_r2r(xmm1, xmm7); /* xmm7=b7a7b6a6b5a5b4a4 */ \ | |
421 /* Free xmm1 */ \ | |
422 movdqu_m2r(*I(2), xmm2); /* xmm2=c7c6c5c4c3c2c1c0 */ \ | |
423 movdqu_m2r(*I(3), xmm3); /* xmm3=d7d6d5d4d3d2d1d0 */ \ | |
424 \ | |
425 movdqu_r2r(xmm2, xmm1); /* make a copy */ \ | |
426 punpcklwd_r2r(xmm3, xmm2); /* xmm2=d3c3d2c2d1c1d0c0 */ \ | |
427 \ | |
428 punpckhwd_r2r(xmm3, xmm1); /* xmm1=d7c7d6c6d5c5d4c4 */ \ | |
429 movdqu_r2r(xmm0, xmm3); /* make a copy */ \ | |
430 \ | |
431 punpckldq_r2r(xmm2, xmm0); /* xmm0=d1c1b1a1d0c0b0a0 */ \ | |
432 punpckhdq_r2r(xmm2, xmm3); /* xmm3=d3c3b3a3d2c2b2a2 */ \ | |
433 /* Free xmm2 */ \ | |
434 movdqu_r2r(xmm7, xmm2); /* make a copy */ \ | |
435 punpckldq_r2r(xmm1, xmm2); /* xmm2=d5c5b5a5d4c4b4a4 */ \ | |
436 \ | |
437 punpckhdq_r2r(xmm1, xmm7); /* xmm7=d7c7b7a7d6c6b6a6 */ \ | |
438 movdqu_r2r(xmm0, xmm1); /* make a copy */ \ | |
439 \ | |
440 punpcklqdq_r2r(xmm4, xmm0); /* xmm0=h0g0f0e0d0c0b0a0 */ \ | |
441 punpckhqdq_r2r(xmm4, xmm1); /* xmm1=h1g1f1e1d1c1b1a1 */ \ | |
442 \ | |
443 movdqu_r2m(xmm0, *I(0)); /* save I(0) */ \ | |
444 movdqu_r2m(xmm1, *I(1)); /* save I(1) */ \ | |
445 \ | |
446 movdqu_m2r(*I(6), xmm0); /* load h3g3f3e3h2g2f2e2 */ \ | |
447 movdqu_r2r(xmm3, xmm1); /* make a copy */ \ | |
448 \ | |
449 punpcklqdq_r2r(xmm0, xmm1); /* xmm1=h2g2f2e2d2c2b2a2 */ \ | |
450 punpckhqdq_r2r(xmm0, xmm3); /* xmm3=h3g3f3e3d3c3b3a3 */ \ | |
451 \ | |
452 movdqu_r2r(xmm2, xmm4); /* make a copy */ \ | |
453 punpcklqdq_r2r(xmm5, xmm4); /* xmm4=h4g4f4e4d4c4b4a4 */ \ | |
454 \ | |
455 punpckhqdq_r2r(xmm5, xmm2); /* xmm2=h5g5f5e5d5c5b5a5 */ \ | |
456 movdqu_r2m(xmm1, *I(2)); /* save I(2) */ \ | |
457 \ | |
458 movdqu_r2m(xmm3, *I(3)); /* save I(3) */ \ | |
459 movdqu_r2m(xmm4, *I(4)); /* save I(4) */ \ | |
460 \ | |
461 movdqu_r2m(xmm2, *I(5)); /* save I(5) */ \ | |
462 movdqu_r2r(xmm7, xmm5); /* make a copy */ \ | |
463 \ | |
464 punpcklqdq_r2r(xmm6, xmm5); /* xmm5=h6g6f6e6d6c6b6a6 */ \ | |
465 punpckhqdq_r2r(xmm6, xmm7); /* xmm7=h7g7f7e7d7c7b7a7 */ \ | |
466 \ | |
467 movdqu_r2m(xmm5, *I(6)); /* save I(6) */ \ | |
468 movdqu_r2m(xmm7, *I(7)); /* save I(7) */ \ | |
469 \ | |
470 } /* End of Transpose Macro */ | |
471 | |
472 | |
473 #define SSE2_Dequantize() { \ | |
474 movdqu_m2r(*(eax), xmm0); \ | |
475 \ | |
476 pmullw_m2r(*(ebx), xmm0); /* xmm0 = 07 06 05 04 03 02 01 00 */ \ | |
477 movdqu_m2r(*(eax + 16), xmm1); \ | |
478 \ | |
479 pmullw_m2r(*(ebx + 16), xmm1); /* xmm1 = 17 16 15 14 13 12 11 10 */ \ | |
480 pshuflw_r2r(xmm0, xmm3, 0x078); /* xmm3 = 07 06 05 04 01 03 02 00 */ \ | |
481 \ | |
482 movdqu_r2r(xmm1, xmm2); /* xmm2 = 17 16 15 14 13 12 11 10 */ \ | |
483 movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \ | |
484 \ | |
485 movdqu_m2r(*(eax + 32), xmm4); \ | |
486 movdqu_m2r(*(eax + 64), xmm5); \ | |
487 \ | |
488 pmullw_m2r(*(ebx + 32), xmm4); /* xmm4 = 27 26 25 24 23 22 21 20 */ \ | |
489 pmullw_m2r(*(ebx + 64), xmm5); /* xmm5 = 47 46 45 44 43 42 41 40 */ \ | |
490 \ | |
491 movdqu_m2r(*(ecx + 16), xmm6); /* xmm6 = -- -- FF FF -- -- -- -- */ \ | |
492 pand_r2r(xmm2, xmm7); /* xmm7 = -- -- -- -- -- 12 11 -- */ \ | |
493 \ | |
494 pand_r2r(xmm4, xmm6); /* xmm6 = -- -- 25 24 -- -- -- -- */ \ | |
495 pxor_r2r(xmm7, xmm2); /* xmm2 = 17 16 15 14 13 -- -- 10 */ \ | |
496 \ | |
497 pxor_r2r(xmm6, xmm4); /* xmm4 = 27 26 -- -- 23 22 21 20 */ \ | |
498 pslldq_i2r(4, xmm7); /* xmm7 = -- -- -- 12 11 -- -- -- */ \ | |
499 \ | |
500 pslldq_i2r(2, xmm6); /* xmm6 = -- 25 24 -- -- -- -- -- */ \ | |
501 por_r2r(xmm6, xmm7); /* xmm7 = -- 25 24 12 11 -- -- -- */ \ | |
502 \ | |
503 movdqu_m2r(*(ecx + 32), xmm0); /* xmm0 = -- -- -- -- -- FF FF FF */ \ | |
504 movdqu_m2r(*(ecx + 48), xmm6); /* xmm6 = -- -- -- -- FF -- -- -- */ \ | |
505 \ | |
506 pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- -- -- 03 02 00 */ \ | |
507 pand_r2r(xmm5, xmm6); /* xmm6 = -- -- -- -- 43 -- -- -- */ \ | |
508 \ | |
509 pxor_r2r(xmm0, xmm3); /* xmm3 = 07 06 05 04 01 -- -- -- */ \ | |
510 pxor_r2r(xmm6, xmm5); /* xmm5 = 47 46 45 44 -- 42 41 40 */ \ | |
511 \ | |
512 por_r2r(xmm7, xmm0); /* xmm0 = -- 25 24 12 11 03 02 00 */ \ | |
513 pslldq_i2r(8, xmm6); /* xmm6 = 43 -- -- -- -- -- -- -- */ \ | |
514 \ | |
515 por_r2r(xmm6, xmm0); /* xmm0 = 43 25 24 12 11 03 02 00 */ \ | |
516 /* 02345 in use */ \ | |
517 \ | |
518 movdqu_m2r(*(ecx + 64 ), xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \ | |
519 pshuflw_r2r(xmm5, xmm5, 0x0B4); /* xmm5 = 47 46 45 44 42 -- 41 40 */ \ | |
520 \ | |
521 movdqu_r2r(xmm1, xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \ | |
522 movdqu_r2r(xmm1, xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ | |
523 \ | |
524 movdqu_r2m(xmm0, *(eax)); /* write 43 25 24 12 11 03 02 00 */ \ | |
525 pshufhw_r2r(xmm4, xmm4, 0x0C2); /* xmm4 = 27 -- -- 26 23 22 21 20 */ \ | |
526 \ | |
527 pand_r2r(xmm4, xmm7); /* xmm7 = -- -- -- 26 23 -- -- -- */ \ | |
528 pand_r2r(xmm5, xmm1); /* xmm1 = -- -- -- 44 42 -- -- -- */ \ | |
529 \ | |
530 pxor_r2r(xmm7, xmm4); /* xmm4 = 27 -- -- -- -- 22 21 20 */ \ | |
531 pxor_r2r(xmm1, xmm5); /* xmm5 = 47 46 45 -- -- -- 41 40 */ \ | |
532 \ | |
533 pshuflw_r2r(xmm2, xmm2, 0x0C6); /* xmm2 = 17 16 15 14 13 10 -- -- */ \ | |
534 movdqu_r2r(xmm6, xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \ | |
535 \ | |
536 pslldq_i2r(2, xmm7); /* xmm7 = -- -- 26 23 -- -- -- -- */ \ | |
537 pslldq_i2r(6, xmm1); /* xmm1 = 44 42 -- -- -- -- -- -- */ \ | |
538 \ | |
539 psrldq_i2r(2, xmm0); /* xmm0 = -- -- -- -- FF FF -- -- */ \ | |
540 pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- 04 01 -- -- -- */ \ | |
541 \ | |
542 pand_r2r(xmm2, xmm0); /* xmm0 = -- -- -- -- 13 10 -- -- */ \ | |
543 pxor_r2r(xmm6, xmm3); /* xmm3 = 07 06 05 -- -- -- -- -- */ \ | |
544 \ | |
545 pxor_r2r(xmm0, xmm2); /* xmm2 = 17 16 15 14 -- -- -- -- */ \ | |
546 psrldq_i2r(6, xmm6); /* xmm6 = -- -- -- -- -- -- 04 01 */ \ | |
547 \ | |
548 por_r2r(xmm7, xmm1); /* xmm1 = 44 42 26 23 -- -- -- -- */ \ | |
549 por_r2r(xmm6, xmm0); /* xmm0 = -- -- -- -- 13 10 04 01 */ \ | |
550 /* 12345 in use */ \ | |
551 por_r2r(xmm0, xmm1); /* xmm1 = 44 42 26 23 13 10 04 01 */ \ | |
552 pshuflw_r2r(xmm4, xmm4, 0x093); /* xmm4 = 27 -- -- -- 22 21 20 -- */ \ | |
553 \ | |
554 pshufhw_r2r(xmm4, xmm4, 0x093); /* xmm4 = -- -- -- 27 22 21 20 -- */ \ | |
555 movdqu_r2m(xmm1, *(eax + 16)); /* write 44 42 26 23 13 10 04 01 */ \ | |
556 \ | |
557 pshufhw_r2r(xmm3, xmm3, 0x0D2); /* xmm3 = 07 05 -- 06 -- -- -- -- */ \ | |
558 movdqu_m2r(*(ecx + 64), xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \ | |
559 \ | |
560 pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- 06 -- -- -- -- */ \ | |
561 psrldq_i2r(12, xmm3); /* xmm3 = -- -- -- -- -- -- 07 05 */ \ | |
562 \ | |
563 psrldq_i2r(8, xmm0); /* xmm0 = -- -- -- -- -- -- -- 06 */ \ | |
564 \ | |
565 movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ | |
566 movdqu_m2r(*(ecx + 96), xmm7); /* xmm7 = -- -- -- -- FF FF -- -- */ \ | |
567 \ | |
568 pand_r2r(xmm4, xmm6); /* xmm6 = -- -- -- 27 22 -- -- -- */ \ | |
569 pxor_r2r(xmm6, xmm4); /* xmm4 = -- -- -- -- -- 21 20 -- */ \ | |
570 \ | |
571 por_r2r(xmm6, xmm3); /* xmm3 = -- -- -- 27 22 -- 07 05 */ \ | |
572 pand_r2r(xmm4, xmm7); /* xmm7 = -- -- -- -- -- 21 -- -- */ \ | |
573 \ | |
574 por_r2r(xmm7, xmm0); /* xmm0 = -- -- -- -- -- 21 -- 06 */ \ | |
575 pxor_r2r(xmm7, xmm4); /* xmm4 = -- -- -- -- -- -- 20 -- */ \ | |
576 \ | |
577 movdqu_m2r(*(ecx + 16 ), xmm6); /* xmm6 = -- -- FF FF -- -- -- -- */ \ | |
578 movdqu_m2r(*(ecx + 64 ), xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \ | |
579 \ | |
580 pand_r2r(xmm2, xmm6); /* xmm6 = -- -- 15 14 -- -- -- -- */ \ | |
581 pand_r2r(xmm6, xmm1); /* xmm1 = -- -- -- 14 -- -- -- -- */ \ | |
582 \ | |
583 pxor_r2r(xmm6, xmm2); /* xmm2 = 17 16 -- -- -- -- -- -- */ \ | |
584 pxor_r2r(xmm1, xmm6); /* xmm6 = -- -- 15 -- -- -- -- -- */ \ | |
585 \ | |
586 psrldq_i2r(4, xmm1); /* xmm1 = -- -- -- -- -- 14 -- -- */ \ | |
587 \ | |
588 psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- 15 -- */ \ | |
589 por_r2r(xmm1, xmm3); /* xmm3 = -- -- -- 27 22 14 07 05 */ \ | |
590 \ | |
591 por_r2r(xmm6, xmm0); /* xmm0 = -- -- -- -- -- 21 15 06 */ \ | |
592 pshufhw_r2r(xmm5, xmm5, 0x0E1); /* xmm5 = 47 46 -- 45 -- -- 41 40 */ \ | |
593 \ | |
594 movdqu_m2r(*(ecx + 64), xmm1); /* xmm1 = -- -- -- FF FF -- -- -- */ \ | |
595 pshuflw_r2r(xmm5, xmm5, 0x072); /* xmm5 = 47 46 -- 45 41 -- 40 -- */ \ | |
596 \ | |
597 movdqu_r2r(xmm1, xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ | |
598 pand_r2r(xmm5, xmm1); /* xmm1 = -- -- -- 45 41 -- -- -- */ \ | |
599 \ | |
600 pxor_r2r(xmm1, xmm5); /* xmm5 = 47 46 -- -- -- -- 40 -- */ \ | |
601 pslldq_i2r(4, xmm1); /* xmm1 = -- 45 41 -- -- -- -- -- */ \ | |
602 \ | |
603 pshufd_r2r(xmm5, xmm5, 0x09C); /* xmm5 = -- -- -- -- 47 46 40 -- */ \ | |
604 por_r2r(xmm1, xmm3); /* xmm3 = -- 45 41 27 22 14 07 05 */ \ | |
605 \ | |
606 movdqu_m2r(*(eax + 96), xmm1); /* xmm1 = 67 66 65 64 63 62 61 60 */ \ | |
607 pmullw_m2r(*(ebx + 96), xmm1); \ | |
608 \ | |
609 movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \ | |
610 \ | |
611 psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \ | |
612 pand_r2r(xmm5, xmm7); /* xmm7 = -- -- -- -- -- 46 40 -- */ \ | |
613 \ | |
614 pand_r2r(xmm1, xmm6); /* xmm6 = -- -- -- -- -- -- -- 60 */ \ | |
615 pxor_r2r(xmm7, xmm5); /* xmm5 = -- -- -- -- 47 -- -- -- */ \ | |
616 \ | |
617 pxor_r2r(xmm6, xmm1); /* xmm1 = 67 66 65 64 63 62 61 -- */ \ | |
618 pslldq_i2r(2, xmm5); /* xmm5 = -- -- -- 47 -- -- -- -- */ \ | |
619 \ | |
620 pslldq_i2r(14, xmm6); /* xmm6 = 60 -- -- -- -- -- -- -- */ \ | |
621 por_r2r(xmm5, xmm4); /* xmm4 = -- -- -- 47 -- -- 20 -- */ \ | |
622 \ | |
623 por_r2r(xmm6, xmm3); /* xmm3 = 60 45 41 27 22 14 07 05 */ \ | |
624 pslldq_i2r(6, xmm7); /* xmm7 = -- -- 46 40 -- -- -- -- */ \ | |
625 \ | |
626 movdqu_r2m(xmm3, *(eax+32)); /* write 60 45 41 27 22 14 07 05 */ \ | |
627 por_r2r(xmm7, xmm0); /* xmm0 = -- -- 46 40 -- 21 15 06 */ \ | |
628 /* 0, 1, 2, 4 in use */ \ | |
629 movdqu_m2r(*(eax + 48), xmm3); /* xmm3 = 37 36 35 34 33 32 31 30 */ \ | |
630 movdqu_m2r(*(eax + 80), xmm5); /* xmm5 = 57 56 55 54 53 52 51 50 */ \ | |
631 \ | |
632 pmullw_m2r(*(ebx + 48), xmm3); \ | |
633 pmullw_m2r(*(ebx + 80), xmm5); \ | |
634 \ | |
635 movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ | |
636 movdqu_m2r(*(ecx + 64), xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \ | |
637 \ | |
638 psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \ | |
639 pslldq_i2r(8, xmm7); /* xmm7 = FF -- -- -- -- -- -- -- */ \ | |
640 \ | |
641 pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- -- -- -- -- 30 */ \ | |
642 pand_r2r(xmm5, xmm7); /* xmm7 = 57 -- -- -- -- -- -- -- */ \ | |
643 \ | |
644 pxor_r2r(xmm6, xmm3); /* xmm3 = 37 36 35 34 33 32 31 -- */ \ | |
645 pxor_r2r(xmm7, xmm5); /* xmm5 = -- 56 55 54 53 52 51 50 */ \ | |
646 \ | |
647 pslldq_i2r(6, xmm6); /* xmm6 = -- -- -- -- 30 -- -- -- */ \ | |
648 psrldq_i2r(2, xmm7); /* xmm7 = -- 57 -- -- -- -- -- -- */ \ | |
649 \ | |
650 por_r2r(xmm7, xmm6); /* xmm6 = -- 57 -- -- 30 -- -- -- */ \ | |
651 movdqu_m2r(*(ecx), xmm7); /* xmm7 = -- -- -- -- -- FF FF -- */ \ | |
652 \ | |
653 por_r2r(xmm6, xmm0); /* xmm0 = -- 57 46 40 30 21 15 06 */ \ | |
654 psrldq_i2r(2, xmm7); /* xmm7 = -- -- -- -- -- -- FF FF */ \ | |
655 \ | |
656 movdqu_r2r(xmm2, xmm6); /* xmm6 = 17 16 -- -- -- -- -- -- */ \ | |
657 pand_r2r(xmm1, xmm7); /* xmm7 = -- -- -- -- -- -- 61 -- */ \ | |
658 \ | |
659 pslldq_i2r(2, xmm6); /* xmm6 = 16 -- -- -- -- -- -- -- */ \ | |
660 psrldq_i2r(14, xmm2); /* xmm2 = -- -- -- -- -- -- -- 17 */ \ | |
661 \ | |
662 pxor_r2r(xmm7, xmm1); /* xmm1 = 67 66 65 64 63 62 -- -- */ \ | |
663 pslldq_i2r(12, xmm7); /* xmm7 = 61 -- -- -- -- -- -- -- */ \ | |
664 \ | |
665 psrldq_i2r(14, xmm6); /* xmm6 = -- -- -- -- -- -- -- 16 */ \ | |
666 por_r2r(xmm6, xmm4); /* xmm4 = -- -- -- 47 -- -- 20 16 */ \ | |
667 \ | |
668 por_r2r(xmm7, xmm0); /* xmm0 = 61 57 46 40 30 21 15 06 */ \ | |
669 movdqu_m2r(*(ecx), xmm6); /* xmm6 = -- -- -- -- -- FF FF -- */ \ | |
670 \ | |
671 psrldq_i2r(2, xmm6); /* xmm6 = -- -- -- -- -- -- FF FF */ \ | |
672 movdqu_r2m(xmm0, *(eax+48)); /* write 61 57 46 40 30 21 15 06 */ \ | |
673 /* 1, 2, 3, 4, 5 in use */\ | |
674 movdqu_m2r(*(ecx), xmm0); /* xmm0 = -- -- -- -- -- FF FF -- */ \ | |
675 pand_r2r(xmm3, xmm6); /* xmm6 = -- -- -- -- -- -- 31 -- */ \ | |
676 \ | |
677 movdqu_r2r(xmm3, xmm7); /* xmm7 = 37 36 35 34 33 32 31 -- */ \ | |
678 pxor_r2r(xmm6, xmm3); /* xmm3 = 37 36 35 34 33 32 -- -- */ \ | |
679 \ | |
680 pslldq_i2r(2, xmm3); /* xmm3 = 36 35 34 33 32 -- -- -- */ \ | |
681 pand_r2r(xmm1, xmm0); /* xmm0 = -- -- -- -- -- 62 -- -- */ \ | |
682 \ | |
683 psrldq_i2r(14, xmm7); /* xmm7 = -- -- -- -- -- -- -- 37 */ \ | |
684 pxor_r2r(xmm0, xmm1); /* xmm1 = 67 66 65 64 63 -- -- -- */ \ | |
685 \ | |
686 por_r2r(xmm7, xmm6); /* xmm6 = -- -- -- -- -- -- 31 37 */ \ | |
687 movdqu_m2r(*(ecx + 64), xmm7); /* xmm7 = -- -- -- FF FF -- -- -- */ \ | |
688 \ | |
689 pshuflw_r2r(xmm6, xmm6, 0x01E); /* xmm6 = -- -- -- -- 37 31 -- -- */ \ | |
690 pslldq_i2r(6, xmm7); /* xmm7 = FF FF -- -- -- -- -- -- */ \ | |
691 \ | |
692 por_r2r(xmm6, xmm4); /* xmm4 = -- -- -- 47 37 31 20 16 */ \ | |
693 pand_r2r(xmm5, xmm7); /* xmm7 = -- 56 -- -- -- -- -- -- */ \ | |
694 \ | |
695 pslldq_i2r(8, xmm0); /* xmm0 = -- 62 -- -- -- -- -- -- */ \ | |
696 pxor_r2r(xmm7, xmm5); /* xmm5 = -- -- 55 54 53 52 51 50 */ \ | |
697 \ | |
698 psrldq_i2r(2, xmm7); /* xmm7 = -- -- 56 -- -- -- -- -- */ \ | |
699 \ | |
700 pshufhw_r2r(xmm3, xmm3, 0x087); /* xmm3 = 35 33 34 36 32 -- -- -- */ \ | |
701 por_r2r(xmm7, xmm0); /* xmm0 = -- 62 56 -- -- -- -- -- */ \ | |
702 \ | |
703 movdqu_m2r(*(eax + 112), xmm7); /* xmm7 = 77 76 75 74 73 72 71 70 */ \ | |
704 pmullw_m2r(*(ebx + 112), xmm7); \ | |
705 \ | |
706 movdqu_m2r(*(ecx + 64), xmm6); /* xmm6 = -- -- -- FF FF -- -- -- */ \ | |
707 por_r2r(xmm0, xmm4); /* xmm4 = -- 62 56 47 37 31 20 16 */ \ | |
708 \ | |
709 pshuflw_r2r(xmm7, xmm7, 0x0E1); /* xmm7 = 77 76 75 74 73 72 70 71 */ \ | |
710 psrldq_i2r(8, xmm6); /* xmm6 = -- -- -- -- -- -- -- FF */ \ | |
711 \ | |
712 movdqu_m2r(*(ecx + 64), xmm0); /* xmm0 = -- -- -- FF FF -- -- -- */ \ | |
713 pand_r2r(xmm7, xmm6); /* xmm6 = -- -- -- -- -- -- -- 71 */ \ | |
714 \ | |
715 pand_r2r(xmm3, xmm0); /* xmm0 = -- -- -- 36 32 -- -- -- */ \ | |
716 pxor_r2r(xmm6, xmm7); /* xmm7 = 77 76 75 74 73 72 70 -- */ \ | |
717 \ | |
718 pxor_r2r(xmm0, xmm3); /* xmm3 = 35 33 34 -- -- -- -- -- */ \ | |
719 pslldq_i2r(14, xmm6); /* xmm6 = 71 -- -- -- -- -- -- -- */ \ | |
720 \ | |
721 psrldq_i2r(4, xmm0); /* xmm0 = -- -- -- -- -- 36 32 -- */ \ | |
722 por_r2r(xmm6, xmm4); /* xmm4 = 71 62 56 47 37 31 20 16 */ \ | |
723 \ | |
724 por_r2r(xmm0, xmm2); /* xmm2 = -- -- -- -- -- 36 32 17 */ \ | |
725 movdqu_r2m(xmm4, *(eax + 64)); /* write 71 62 56 47 37 31 20 16 */ \ | |
726 /* 1, 2, 3, 5, 7 in use */ \ | |
727 movdqu_m2r(*(ecx + 80), xmm6); /* xmm6 = -- -- FF -- -- -- -- FF */ \ | |
728 pshufhw_r2r(xmm7, xmm7, 0x0D2); /* xmm7 = 77 75 74 76 73 72 70 __ */ \ | |
729 \ | |
730 movdqu_m2r(*(ecx), xmm4); /* xmm4 = -- -- -- -- -- FF FF -- */ \ | |
731 movdqu_m2r(*(ecx+48), xmm0); /* xmm0 = -- -- -- -- FF -- -- -- */ \ | |
732 \ | |
733 pand_r2r(xmm5, xmm6); /* xmm6 = -- -- 55 -- -- -- -- 50 */ \ | |
734 pand_r2r(xmm7, xmm4); /* xmm4 = -- -- -- -- -- 72 70 -- */ \ | |
735 \ | |
736 pand_r2r(xmm1, xmm0); /* xmm0 = -- -- -- -- 63 -- -- -- */ \ | |
737 pxor_r2r(xmm6, xmm5); /* xmm5 = -- -- -- 54 53 52 51 -- */ \ | |
738 \ | |
739 pxor_r2r(xmm4, xmm7); /* xmm7 = 77 75 74 76 73 -- -- -- */ \ | |
740 pxor_r2r(xmm0, xmm1); /* xmm1 = 67 66 65 64 -- -- -- -- */ \ | |
741 \ | |
742 pshuflw_r2r(xmm6, xmm6, 0x02B); /* xmm6 = -- -- 55 -- 50 -- -- -- */ \ | |
743 pslldq_i2r(10, xmm4); /* xmm4 = 72 20 -- -- -- -- -- -- */ \ | |
744 \ | |
745 pshufhw_r2r(xmm6, xmm6, 0x0B1); /* xmm6 = -- -- -- 55 50 -- -- -- */ \ | |
746 pslldq_i2r(4, xmm0); /* xmm0 = -- -- 63 -- -- -- -- -- */ \ | |
747 \ | |
748 por_r2r(xmm4, xmm6); /* xmm6 = 72 70 -- 55 50 -- -- -- */ \ | |
749 por_r2r(xmm0, xmm2); /* xmm2 = -- -- 63 -- -- 36 32 17 */ \ | |
750 \ | |
751 por_r2r(xmm6, xmm2); /* xmm2 = 72 70 64 55 50 36 32 17 */ \ | |
752 pshufhw_r2r(xmm1, xmm1, 0x0C9); /* xmm1 = 67 64 66 65 -- -- -- -- */ \ | |
753 \ | |
754 movdqu_r2r(xmm3, xmm6); /* xmm6 = 35 33 34 -- -- -- -- -- */ \ | |
755 movdqu_r2m(xmm2, *(eax+80)); /* write 72 70 64 55 50 36 32 17 */ \ | |
756 \ | |
757 psrldq_i2r(12, xmm6); /* xmm6 = -- -- -- -- -- -- 35 33 */ \ | |
758 pslldq_i2r(4, xmm3); /* xmm3 = 34 -- -- -- -- -- -- -- */ \ | |
759 \ | |
760 pshuflw_r2r(xmm5, xmm5, 0x04E); /* xmm5 = -- -- -- 54 51 -- 53 52 */ \ | |
761 movdqu_r2r(xmm7, xmm4); /* xmm4 = 77 75 74 76 73 -- -- -- */ \ | |
762 \ | |
763 movdqu_r2r(xmm5, xmm2); /* xmm2 = -- -- -- 54 51 -- 53 52 */ \ | |
764 psrldq_i2r(10, xmm7); /* xmm7 = -- -- -- -- -- 77 75 74 */ \ | |
765 \ | |
766 pslldq_i2r(6, xmm4); /* xmm4 = 76 73 -- -- -- -- -- -- */ \ | |
767 pslldq_i2r(12, xmm2); /* xmm2 = 53 52 -- -- -- -- -- -- */ \ | |
768 \ | |
769 movdqu_r2r(xmm1, xmm0); /* xmm0 = 67 64 66 65 -- -- -- -- */ \ | |
770 psrldq_i2r(12, xmm1); /* xmm1 = -- -- -- -- -- -- 67 64 */ \ | |
771 \ | |
772 psrldq_i2r(6, xmm5); /* xmm5 = -- -- -- -- -- -- 54 51 */ \ | |
773 psrldq_i2r(14, xmm3); /* xmm3 = -- -- -- -- -- -- -- 34 */ \ | |
774 \ | |
775 pslldq_i2r(10, xmm7); /* xmm7 = 77 75 74 -- -- -- -- -- */ \ | |
776 por_r2r(xmm6, xmm4); /* xmm4 = 76 73 -- -- -- -- 35 33 */ \ | |
777 \ | |
778 psrldq_i2r(10, xmm2); /* xmm2 = -- -- -- -- -- 53 52 -- */ \ | |
779 pslldq_i2r(4, xmm0); /* xmm0 = 66 65 -- -- -- -- -- -- */ \ | |
780 \ | |
781 pslldq_i2r(8, xmm1); /* xmm1 = -- -- 67 64 -- -- -- -- */ \ | |
782 por_r2r(xmm7, xmm3); /* xmm3 = 77 75 74 -- -- -- -- 34 */ \ | |
783 \ | |
784 psrldq_i2r(6, xmm0); /* xmm0 = -- -- -- 66 65 -- -- -- */ \ | |
785 pslldq_i2r(4, xmm5); /* xmm5 = -- -- -- -- 54 51 -- -- */ \ | |
786 \ | |
787 por_r2r(xmm1, xmm4); /* xmm4 = 76 73 67 64 -- -- 35 33 */ \ | |
788 por_r2r(xmm2, xmm3); /* xmm3 = 77 75 74 -- -- 53 52 34 */ \ | |
789 \ | |
790 por_r2r(xmm5, xmm4); /* xmm4 = 76 73 67 64 54 51 35 33 */ \ | |
791 por_r2r(xmm0, xmm3); /* xmm3 = 77 75 74 66 65 53 52 34 */ \ | |
792 \ | |
793 movdqu_r2m(xmm4, *(eax+96)); /* write 76 73 67 64 54 51 35 33 */ \ | |
794 movdqu_r2m(xmm3, *(eax+112)); /* write 77 75 74 66 65 53 52 34 */ \ | |
795 \ | |
796 } /* end of SSE2_Dequantize Macro */ | |
797 | |
798 | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
1978
diff
changeset
|
799 void ff_vp3_idct_sse2(int16_t *input_data) |
1970 | 800 { |
801 unsigned char *input_bytes = (unsigned char *)input_data; | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
1978
diff
changeset
|
802 unsigned char *output_data_bytes = (unsigned char *)input_data; |
1970 | 803 unsigned char *idct_data_bytes = (unsigned char *)SSE2_idct_data; |
804 unsigned char *Eight = (unsigned char *)eight_data; | |
805 | |
806 #define eax input_bytes | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
1978
diff
changeset
|
807 //#define ebx dequant_matrix_bytes |
1970 | 808 #define ecx dequant_const_bytes |
809 #define edx idct_data_bytes | |
810 | |
811 #define I(i) (eax + 16 * i) | |
812 #define O(i) (ebx + 16 * i) | |
813 #define C(i) (edx + 16 * (i-1)) | |
814 | |
2696
9699d325049d
porting the mmx&sse2 (sse2 untested) vp3 idcts to the lavc idct API
michael
parents:
1978
diff
changeset
|
815 // SSE2_Dequantize(); |
1970 | 816 |
817 #undef ebx | |
818 #define ebx output_data_bytes | |
819 | |
820 SSE2_Row_IDCT(); | |
821 | |
822 SSE2_Transpose(); | |
823 | |
824 SSE2_Column_IDCT(); | |
825 } |