Mercurial > libavcodec.hg
comparison i386/vp3dsp_mmx.c @ 1866:1755f959ab7f libavcodec
seperated out the C-based VP3 DSP functions into a different file; also
ported the MMX-optimized versions of those functions
author | melanson |
---|---|
date | Mon, 08 Mar 2004 02:13:45 +0000 |
parents | |
children | 56cb752222cc |
comparison
equal
deleted
inserted
replaced
1865:92e94c550def | 1866:1755f959ab7f |
---|---|
1 /* | |
2 * Copyright (C) 2004 the ffmpeg project | |
3 * | |
4 * This library is free software; you can redistribute it and/or | |
5 * modify it under the terms of the GNU Lesser General Public | |
6 * License as published by the Free Software Foundation; either | |
7 * version 2 of the License, or (at your option) any later version. | |
8 * | |
9 * This library is distributed in the hope that it will be useful, | |
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
12 * Lesser General Public License for more details. | |
13 * | |
14 * You should have received a copy of the GNU Lesser General Public | |
15 * License along with this library; if not, write to the Free Software | |
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
17 */ | |
18 | |
19 /** | |
20 * @file vp3dsp_mmx.c | |
21 * MMX-optimized functions cribbed from the original VP3 source code. | |
22 */ | |
23 | |
24 #include "../dsputil.h" | |
25 #include "mmx.h" | |
26 | |
27 #define IdctAdjustBeforeShift 8 | |
28 | |
29 /* (12 * 4) 2-byte memory locations ( = 96 bytes total) | |
30 * idct_constants[0..15] = Mask table (M(I)) | |
31 * idct_constants[16..43] = Cosine table (C(I)) | |
32 * idct_constants[44..47] = 8 | |
33 */ | |
34 static uint16_t idct_constants[(4 + 7 + 1) * 4]; | |
35 static uint16_t idct_cosine_table[7] = { | |
36 64277, 60547, 54491, 46341, 36410, 25080, 12785 | |
37 }; | |
38 | |
39 #define r0 mm0 | |
40 #define r1 mm1 | |
41 #define r2 mm2 | |
42 #define r3 mm3 | |
43 #define r4 mm4 | |
44 #define r5 mm5 | |
45 #define r6 mm6 | |
46 #define r7 mm7 | |
47 | |
48 /* from original comments: The Macro does IDct on 4 1-D Dcts */ | |
49 #define BeginIDCT() \ | |
50 movq_m2r(*I(3), r2); \ | |
51 movq_m2r(*C(3), r6); \ | |
52 movq_r2r(r2, r4); \ | |
53 movq_m2r(*J(5), r7); \ | |
54 pmulhw_r2r(r6, r4); \ | |
55 movq_m2r(*C(5), r1); \ | |
56 pmulhw_r2r(r7, r6); \ | |
57 movq_r2r(r1, r5); \ | |
58 pmulhw_r2r(r2, r1); \ | |
59 movq_m2r(*I(1), r3); \ | |
60 pmulhw_r2r(r7, r5); \ | |
61 movq_m2r(*C(1), r0); \ | |
62 paddw_r2r(r2, r4); \ | |
63 paddw_r2r(r7, r6); \ | |
64 paddw_r2r(r1, r2); \ | |
65 movq_m2r(*J(7), r1); \ | |
66 paddw_r2r(r5, r7); \ | |
67 movq_r2r(r0, r5); \ | |
68 pmulhw_r2r(r3, r0); \ | |
69 paddsw_r2r(r7, r4); \ | |
70 pmulhw_r2r(r1, r5); \ | |
71 movq_m2r(*C(7), r7); \ | |
72 psubsw_r2r(r2, r6); \ | |
73 paddw_r2r(r3, r0); \ | |
74 pmulhw_r2r(r7, r3); \ | |
75 movq_m2r(*I(2), r2); \ | |
76 pmulhw_r2r(r1, r7); \ | |
77 paddw_r2r(r1, r5); \ | |
78 movq_r2r(r2, r1); \ | |
79 pmulhw_m2r(*C(2), r2); \ | |
80 psubsw_r2r(r5, r3); \ | |
81 movq_m2r(*J(6), r5); \ | |
82 paddsw_r2r(r7, r0); \ | |
83 movq_r2r(r5, r7); \ | |
84 psubsw_r2r(r4, r0); \ | |
85 pmulhw_m2r(*C(2), r5); \ | |
86 paddw_r2r(r1, r2); \ | |
87 pmulhw_m2r(*C(6), r1); \ | |
88 paddsw_r2r(r4, r4); \ | |
89 paddsw_r2r(r0, r4); \ | |
90 psubsw_r2r(r6, r3); \ | |
91 paddw_r2r(r7, r5); \ | |
92 paddsw_r2r(r6, r6); \ | |
93 pmulhw_m2r(*C(6), r7); \ | |
94 paddsw_r2r(r3, r6); \ | |
95 movq_r2m(r4, *I(1)); \ | |
96 psubsw_r2r(r5, r1); \ | |
97 movq_m2r(*C(4), r4); \ | |
98 movq_r2r(r3, r5); \ | |
99 pmulhw_r2r(r4, r3); \ | |
100 paddsw_r2r(r2, r7); \ | |
101 movq_r2m(r6, *I(2)); \ | |
102 movq_r2r(r0, r2); \ | |
103 movq_m2r(*I(0), r6); \ | |
104 pmulhw_r2r(r4, r0); \ | |
105 paddw_r2r(r3, r5); \ | |
106 movq_m2r(*J(4), r3); \ | |
107 psubsw_r2r(r1, r5); \ | |
108 paddw_r2r(r0, r2); \ | |
109 psubsw_r2r(r3, r6); \ | |
110 movq_r2r(r6, r0); \ | |
111 pmulhw_r2r(r4, r6); \ | |
112 paddsw_r2r(r3, r3); \ | |
113 paddsw_r2r(r1, r1); \ | |
114 paddsw_r2r(r0, r3); \ | |
115 paddsw_r2r(r5, r1); \ | |
116 pmulhw_r2r(r3, r4); \ | |
117 paddsw_r2r(r0, r6); \ | |
118 psubsw_r2r(r2, r6); \ | |
119 paddsw_r2r(r2, r2); \ | |
120 movq_m2r(*I(1), r0); \ | |
121 paddsw_r2r(r6, r2); \ | |
122 paddw_r2r(r3, r4); \ | |
123 psubsw_r2r(r1, r2); | |
124 | |
125 /* RowIDCT gets ready to transpose */ | |
126 #define RowIDCT() \ | |
127 \ | |
128 BeginIDCT() \ | |
129 \ | |
130 movq_m2r(*I(2), r3); \ | |
131 psubsw_r2r(r7, r4); \ | |
132 paddsw_r2r(r1, r1); \ | |
133 paddsw_r2r(r7, r7); \ | |
134 paddsw_r2r(r2, r1); \ | |
135 paddsw_r2r(r4, r7); \ | |
136 psubsw_r2r(r3, r4); \ | |
137 psubsw_r2r(r5, r6); \ | |
138 paddsw_r2r(r5, r5); \ | |
139 paddsw_r2r(r4, r3); \ | |
140 paddsw_r2r(r6, r5); \ | |
141 psubsw_r2r(r0, r7); \ | |
142 paddsw_r2r(r0, r0); \ | |
143 movq_r2m(r1, *I(1)); \ | |
144 paddsw_r2r(r7, r0); | |
145 | |
146 /* Column IDCT normalizes and stores final results */ | |
147 #define ColumnIDCT() \ | |
148 \ | |
149 BeginIDCT() \ | |
150 \ | |
151 paddsw_m2r(*Eight, r2); \ | |
152 paddsw_r2r(r1, r1); \ | |
153 paddsw_r2r(r2, r1); \ | |
154 psraw_i2r(4, r2); \ | |
155 psubsw_r2r(r7, r4); \ | |
156 psraw_i2r(4, r1); \ | |
157 movq_m2r(*I(2), r3); \ | |
158 paddsw_r2r(r7, r7); \ | |
159 movq_r2m(r2, *I(2)); \ | |
160 paddsw_r2r(r4, r7); \ | |
161 movq_r2m(r1, *I(1)); \ | |
162 psubsw_r2r(r3, r4); \ | |
163 paddsw_m2r(*Eight, r4); \ | |
164 paddsw_r2r(r3, r3); \ | |
165 paddsw_r2r(r4, r3); \ | |
166 psraw_i2r(4, r4); \ | |
167 psubsw_r2r(r5, r6); \ | |
168 psraw_i2r(4, r3); \ | |
169 paddsw_m2r(*Eight, r6); \ | |
170 paddsw_r2r(r5, r5); \ | |
171 paddsw_r2r(r6, r5); \ | |
172 psraw_i2r(4, r6); \ | |
173 movq_r2m(r4, *J(4)); \ | |
174 psraw_i2r(4, r5); \ | |
175 movq_r2m(r3, *I(3)); \ | |
176 psubsw_r2r(r0, r7); \ | |
177 paddsw_m2r(*Eight, r7); \ | |
178 paddsw_r2r(r0, r0); \ | |
179 paddsw_r2r(r7, r0); \ | |
180 psraw_i2r(4, r7); \ | |
181 movq_r2m(r6, *J(6)); \ | |
182 psraw_i2r(4, r0); \ | |
183 movq_r2m(r5, *J(5)); \ | |
184 movq_r2m(r7, *J(7)); \ | |
185 movq_r2m(r0, *I(0)); | |
186 | |
187 | |
188 /* Following macro does two 4x4 transposes in place. | |
189 | |
190 At entry (we assume): | |
191 | |
192 r0 = a3 a2 a1 a0 | |
193 I(1) = b3 b2 b1 b0 | |
194 r2 = c3 c2 c1 c0 | |
195 r3 = d3 d2 d1 d0 | |
196 | |
197 r4 = e3 e2 e1 e0 | |
198 r5 = f3 f2 f1 f0 | |
199 r6 = g3 g2 g1 g0 | |
200 r7 = h3 h2 h1 h0 | |
201 | |
202 At exit, we have: | |
203 | |
204 I(0) = d0 c0 b0 a0 | |
205 I(1) = d1 c1 b1 a1 | |
206 I(2) = d2 c2 b2 a2 | |
207 I(3) = d3 c3 b3 a3 | |
208 | |
209 J(4) = h0 g0 f0 e0 | |
210 J(5) = h1 g1 f1 e1 | |
211 J(6) = h2 g2 f2 e2 | |
212 J(7) = h3 g3 f3 e3 | |
213 | |
214 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. | |
215 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. | |
216 | |
217 Since r1 is free at entry, we calculate the Js first. */ | |
218 | |
219 #define Transpose() \ | |
220 movq_r2r(r4, r1); \ | |
221 punpcklwd_r2r(r5, r4); \ | |
222 movq_r2m(r0, *I(0)); \ | |
223 punpckhwd_r2r(r5, r1); \ | |
224 movq_r2r(r6, r0); \ | |
225 punpcklwd_r2r(r7, r6); \ | |
226 movq_r2r(r4, r5); \ | |
227 punpckldq_r2r(r6, r4); \ | |
228 punpckhdq_r2r(r6, r5); \ | |
229 movq_r2r(r1, r6); \ | |
230 movq_r2m(r4, *J(4)); \ | |
231 punpckhwd_r2r(r7, r0); \ | |
232 movq_r2m(r5, *J(5)); \ | |
233 punpckhdq_r2r(r0, r6); \ | |
234 movq_m2r(*I(0), r4); \ | |
235 punpckldq_r2r(r0, r1); \ | |
236 movq_m2r(*I(1), r5); \ | |
237 movq_r2r(r4, r0); \ | |
238 movq_r2m(r6, *J(7)); \ | |
239 punpcklwd_r2r(r5, r0); \ | |
240 movq_r2m(r1, *J(6)); \ | |
241 punpckhwd_r2r(r5, r4); \ | |
242 movq_r2r(r2, r5); \ | |
243 punpcklwd_r2r(r3, r2); \ | |
244 movq_r2r(r0, r1); \ | |
245 punpckldq_r2r(r2, r0); \ | |
246 punpckhdq_r2r(r2, r1); \ | |
247 movq_r2r(r4, r2); \ | |
248 movq_r2m(r0, *I(0)); \ | |
249 punpckhwd_r2r(r3, r5); \ | |
250 movq_r2m(r1, *I(1)); \ | |
251 punpckhdq_r2r(r5, r4); \ | |
252 punpckldq_r2r(r5, r2); \ | |
253 movq_r2m(r4, *I(3)); \ | |
254 movq_r2m(r2, *I(2)); | |
255 | |
256 | |
257 void vp3_dsp_init_mmx(void) | |
258 { | |
259 int j = 16; | |
260 uint16_t *p; | |
261 | |
262 do { | |
263 idct_constants[--j] = 0; | |
264 } while (j); | |
265 | |
266 idct_constants[0] = idct_constants[5] = | |
267 idct_constants[10] = idct_constants[15] = 65535; | |
268 | |
269 j = 1; | |
270 do { | |
271 p = idct_constants + ((j + 3) << 2); | |
272 p[0] = p[1] = p[2] = p[3] = idct_cosine_table[j - 1]; | |
273 } while (++j <= 7); | |
274 | |
275 idct_constants[44] = idct_constants[45] = | |
276 idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift; | |
277 } | |
278 | |
279 static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix, | |
280 int16_t *output_data) | |
281 { | |
282 /* eax = quantized input | |
283 * ebx = dequantizer matrix | |
284 * ecx = IDCT constants | |
285 * M(I) = ecx + MaskOffset(0) + I * 8 | |
286 * C(I) = ecx + CosineOffset(32) + (I-1) * 8 | |
287 * edx = output | |
288 * r0..r7 = mm0..mm7 | |
289 */ | |
290 | |
291 #define M(x) (idct_constants + x * 4) | |
292 #define C(x) (idct_constants + 16 + (x - 1) * 4) | |
293 #define Eight (idct_constants + 44) | |
294 | |
295 movq_m2r(*input_data, r0); | |
296 pmullw_m2r(*dequant_matrix, r0); | |
297 movq_m2r(*(input_data + 8), r1); | |
298 pmullw_m2r(*(dequant_matrix + 8), r1); | |
299 movq_m2r(*M(0), r2); | |
300 movq_r2r(r0, r3); | |
301 movq_m2r(*(input_data + 4), r4); | |
302 psrlq_i2r(16, r0); | |
303 pmullw_m2r(*(dequant_matrix + 4), r4); | |
304 pand_r2r(r2, r3); | |
305 movq_r2r(r0, r5); | |
306 movq_r2r(r1, r6); | |
307 pand_r2r(r2, r5); | |
308 psllq_i2r(32, r6); | |
309 movq_m2r(*M(3), r7); | |
310 pxor_r2r(r5, r0); | |
311 pand_r2r(r6, r7); | |
312 por_r2r(r3, r0); | |
313 pxor_r2r(r7, r6); | |
314 por_r2r(r7, r0); | |
315 movq_m2r(*M(3), r7); | |
316 movq_r2r(r4, r3); | |
317 movq_r2m(r0, *output_data); | |
318 | |
319 pand_r2r(r2, r3); | |
320 movq_m2r(*(input_data + 16), r0); | |
321 psllq_i2r(16, r3); | |
322 pmullw_m2r(*(dequant_matrix + 16), r0); | |
323 pand_r2r(r1, r7); | |
324 por_r2r(r3, r5); | |
325 por_r2r(r6, r7); | |
326 movq_m2r(*(input_data + 12), r3); | |
327 por_r2r(r5, r7); | |
328 pmullw_m2r(*(dequant_matrix + 12), r3); | |
329 psrlq_i2r(16, r4); | |
330 movq_r2m(r7, *(output_data + 8)); | |
331 | |
332 movq_r2r(r4, r5); | |
333 movq_r2r(r0, r7); | |
334 psrlq_i2r(16, r4); | |
335 psrlq_i2r(48, r7); | |
336 movq_r2r(r2, r6); | |
337 pand_r2r(r2, r5); | |
338 pand_r2r(r4, r6); | |
339 movq_r2m(r7, *(output_data + 40)); | |
340 | |
341 pxor_r2r(r6, r4); | |
342 psrlq_i2r(32, r1); | |
343 por_r2r(r5, r4); | |
344 movq_m2r(*M(3), r7); | |
345 pand_r2r(r2, r1); | |
346 movq_m2r(*(input_data + 24), r5); | |
347 psllq_i2r(16, r0); | |
348 pmullw_m2r(*(dequant_matrix + 24), r5); | |
349 pand_r2r(r0, r7); | |
350 movq_r2m(r1, *(output_data + 32)); | |
351 | |
352 por_r2r(r4, r7); | |
353 movq_r2r(r3, r4); | |
354 pand_r2r(r2, r3); | |
355 movq_m2r(*M(2), r1); | |
356 psllq_i2r(32, r3); | |
357 por_r2r(r3, r7); | |
358 movq_r2r(r5, r3); | |
359 psllq_i2r(48, r3); | |
360 pand_r2r(r0, r1); | |
361 movq_r2m(r7, *(output_data + 16)); | |
362 | |
363 por_r2r(r3, r6); | |
364 movq_m2r(*M(1), r7); | |
365 por_r2r(r1, r6); | |
366 movq_m2r(*(input_data + 28), r1); | |
367 pand_r2r(r4, r7); | |
368 pmullw_m2r(*(dequant_matrix + 28), r1); | |
369 por_r2r(r6, r7); | |
370 pand_m2r(*M(1), r0); | |
371 psrlq_i2r(32, r4); | |
372 movq_r2m(r7, *(output_data + 24)); | |
373 | |
374 movq_r2r(r4, r6); | |
375 movq_m2r(*M(3), r7); | |
376 pand_r2r(r2, r4); | |
377 movq_m2r(*M(1), r3); | |
378 pand_r2r(r1, r7); | |
379 pand_r2r(r5, r3); | |
380 por_r2r(r4, r0); | |
381 psllq_i2r(16, r3); | |
382 por_r2r(r0, r7); | |
383 movq_m2r(*M(2), r4); | |
384 por_r2r(r3, r7); | |
385 movq_m2r(*(input_data + 40), r0); | |
386 movq_r2r(r4, r3); | |
387 pmullw_m2r(*(dequant_matrix + 40), r0); | |
388 pand_r2r(r5, r4); | |
389 movq_r2m(r7, *(output_data + 4)); | |
390 | |
391 por_r2r(r4, r6); | |
392 movq_r2r(r3, r4); | |
393 psrlq_i2r(16, r6); | |
394 movq_r2r(r0, r7); | |
395 pand_r2r(r1, r4); | |
396 psllq_i2r(48, r7); | |
397 por_r2r(r4, r6); | |
398 movq_m2r(*(input_data + 44), r4); | |
399 por_r2r(r6, r7); | |
400 pmullw_m2r(*(dequant_matrix + 44), r4); | |
401 psrlq_i2r(16, r3); | |
402 movq_r2m(r7, *(output_data + 12)); | |
403 | |
404 pand_r2r(r1, r3); | |
405 psrlq_i2r(48, r5); | |
406 pand_r2r(r2, r1); | |
407 movq_m2r(*(input_data + 52), r6); | |
408 por_r2r(r3, r5); | |
409 pmullw_m2r(*(input_data + 52), r6); | |
410 psrlq_i2r(16, r0); | |
411 movq_r2r(r4, r7); | |
412 movq_r2r(r2, r3); | |
413 psllq_i2r(48, r7); | |
414 pand_r2r(r0, r3); | |
415 pxor_r2r(r3, r0); | |
416 psllq_i2r(32, r3); | |
417 por_r2r(r5, r7); | |
418 movq_r2r(r6, r5); | |
419 pand_m2r(*M(1), r6); | |
420 por_r2r(r3, r7); | |
421 psllq_i2r(32, r6); | |
422 por_r2r(r1, r0); | |
423 movq_r2m(r7, *(output_data + 20)); | |
424 | |
425 por_r2r(r6, r0); | |
426 movq_m2r(*(input_data + 60), r7); | |
427 movq_r2r(r5, r6); | |
428 pmullw_m2r(*(input_data + 60), r7); | |
429 psrlq_i2r(32, r5); | |
430 pand_r2r(r2, r6); | |
431 movq_r2r(r5, r1); | |
432 movq_r2m(r0, *(output_data + 28)); | |
433 | |
434 pand_r2r(r2, r1); | |
435 movq_m2r(*(input_data + 56), r0); | |
436 movq_r2r(r7, r3); | |
437 pmullw_m2r(*(dequant_matrix + 56), r0); | |
438 psllq_i2r(16, r3); | |
439 pand_m2r(*M(3), r7); | |
440 pxor_r2r(r1, r5); | |
441 por_r2r(r5, r6); | |
442 movq_r2r(r3, r5); | |
443 pand_m2r(*M(3), r5); | |
444 por_r2r(r1, r7); | |
445 movq_m2r(*(input_data + 48), r1); | |
446 pxor_r2r(r5, r3); | |
447 pmullw_m2r(*(dequant_matrix + 48), r1); | |
448 por_r2r(r3, r7); | |
449 por_r2r(r5, r6); | |
450 movq_r2r(r0, r5); | |
451 movq_r2m(r7, *(output_data + 60)); | |
452 | |
453 psrlq_i2r(16, r5); | |
454 pand_m2r(*M(2), r5); | |
455 movq_r2r(r0, r7); | |
456 por_r2r(r5, r6); | |
457 pand_r2r(r2, r0); | |
458 pxor_r2r(r0, r7); | |
459 psllq_i2r(32, r0); | |
460 movq_r2m(r6, *(output_data + 52)); | |
461 | |
462 psrlq_i2r(16, r4); | |
463 movq_m2r(*(input_data + 36), r5); | |
464 psllq_i2r(16, r7); | |
465 pmullw_m2r(*(dequant_matrix + 36), r5); | |
466 movq_r2r(r7, r6); | |
467 movq_m2r(*M(2), r3); | |
468 psllq_i2r(16, r6); | |
469 pand_m2r(*M(3), r7); | |
470 pand_r2r(r1, r3); | |
471 por_r2r(r0, r7); | |
472 movq_r2r(r1, r0); | |
473 pand_m2r(*M(3), r1); | |
474 por_r2r(r3, r6); | |
475 movq_r2r(r4, r3); | |
476 psrlq_i2r(32, r1); | |
477 pand_r2r(r2, r3); | |
478 por_r2r(r1, r7); | |
479 por_r2r(r3, r7); | |
480 movq_r2r(r4, r3); | |
481 pand_m2r(*M(1), r3); | |
482 movq_r2r(r5, r1); | |
483 movq_r2m(r7, *(output_data + 44)); | |
484 | |
485 psrlq_i2r(48, r5); | |
486 movq_m2r(*(input_data + 32), r7); | |
487 por_r2r(r3, r6); | |
488 pmullw_m2r(*(dequant_matrix + 32), r7); | |
489 por_r2r(r5, r6); | |
490 pand_m2r(*M(2), r4); | |
491 psllq_i2r(32, r0); | |
492 movq_r2m(r6, *(output_data + 36)); | |
493 | |
494 movq_r2r(r0, r6); | |
495 pand_m2r(*M(3), r0); | |
496 psllq_i2r(16, r6); | |
497 movq_m2r(*(input_data + 20), r5); | |
498 movq_r2r(r1, r3); | |
499 pmullw_m2r(*(dequant_matrix + 40), r5); | |
500 psrlq_i2r(16, r1); | |
501 pand_m2r(*M(1), r1); | |
502 por_r2r(r4, r0); | |
503 pand_r2r(r7, r2); | |
504 por_r2r(r1, r0); | |
505 por_r2r(r2, r0); | |
506 psllq_i2r(16, r3); | |
507 movq_r2r(r3, r4); | |
508 movq_r2r(r5, r2); | |
509 movq_r2m(r0, *(output_data + 56)); | |
510 | |
511 psrlq_i2r(48, r2); | |
512 pand_m2r(*M(2), r4); | |
513 por_r2r(r2, r6); | |
514 movq_m2r(*M(1), r2); | |
515 por_r2r(r4, r6); | |
516 pand_r2r(r7, r2); | |
517 psllq_i2r(32, r3); | |
518 por_m2r(*(output_data + 40), r3); | |
519 | |
520 por_r2r(r2, r6); | |
521 movq_m2r(*M(3), r2); | |
522 psllq_i2r(16, r5); | |
523 movq_r2m(r6, *(output_data + 48)); | |
524 | |
525 pand_r2r(r5, r2); | |
526 movq_m2r(*M(2), r6); | |
527 pxor_r2r(r2, r5); | |
528 pand_r2r(r7, r6); | |
529 psrlq_i2r(32, r2); | |
530 pand_m2r(*M(3), r7); | |
531 por_r2r(r2, r3); | |
532 por_m2r(*(output_data + 32), r7); | |
533 | |
534 por_r2r(r3, r6); | |
535 por_r2r(r5, r7); | |
536 movq_r2m(r6, *(output_data + 40)); | |
537 movq_r2m(r7, *(output_data + 32)); | |
538 | |
539 | |
540 #undef M | |
541 | |
542 /* at this point, function has completed dequantization + dezigzag + | |
543 * partial transposition; now do the idct itself */ | |
544 | |
545 #define I(K) (output_data + K * 8) | |
546 #define J(K) (output_data + ((K - 4) * 8) + 4) | |
547 | |
548 RowIDCT(); | |
549 Transpose(); | |
550 | |
551 #undef I | |
552 #undef J | |
553 #define I(K) (output_data + (K * 8) + 32) | |
554 #define J(K) (output_data + ((K - 4) * 8) + 36) | |
555 | |
556 RowIDCT(); | |
557 Transpose(); | |
558 | |
559 #undef I | |
560 #undef J | |
561 #define I(K) (output_data + K * 8) | |
562 #define J(K) (output_data + K * 8) | |
563 | |
564 ColumnIDCT(); | |
565 | |
566 #undef I | |
567 #undef J | |
568 #define I(K) (output_data + (K * 8) + 4) | |
569 #define J(K) (output_data + (K * 8) + 4) | |
570 | |
571 ColumnIDCT(); | |
572 | |
573 #undef I | |
574 #undef J | |
575 | |
576 } | |
577 | |
578 void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix, | |
579 int coeff_count, uint8_t *dest, int stride) | |
580 { | |
581 int16_t transformed_data[64]; | |
582 int16_t *op; | |
583 int i, j; | |
584 uint8_t vector128[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; | |
585 | |
586 vp3_idct_mmx(input_data, dequant_matrix, transformed_data); | |
587 | |
588 /* place in final output */ | |
589 op = transformed_data; | |
590 movq_m2r(*vector128, mm0); | |
591 for (i = 0; i < 8; i++) { | |
592 #if 1 | |
593 for (j = 0; j < 8; j++) { | |
594 if (*op < -128) | |
595 *dest = 0; | |
596 else if (*op > 127) | |
597 *dest = 255; | |
598 else | |
599 *dest = (uint8_t)(*op + 128); | |
600 op++; | |
601 dest++; | |
602 } | |
603 dest += (stride - 8); | |
604 #else | |
605 /* prototype optimization */ | |
606 pxor_r2r(mm1, mm1); | |
607 packsswb_m2r(*(op + 4), mm1); | |
608 movq_r2r(mm1, mm2); | |
609 psrlq_i2r(32, mm2); | |
610 packsswb_m2r(*(op + 0), mm1); | |
611 op += 8; | |
612 por_r2r(mm2, mm1); | |
613 paddb_r2r(mm0, mm1); | |
614 movq_r2m(mm1, *dest); | |
615 dest += stride; | |
616 #endif | |
617 } | |
618 | |
619 /* be a good MMX citizen */ | |
620 emms(); | |
621 } | |
622 | |
623 void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix, | |
624 int coeff_count, uint8_t *dest, int stride) | |
625 { | |
626 int16_t transformed_data[64]; | |
627 int16_t *op; | |
628 int i, j; | |
629 int16_t sample; | |
630 | |
631 vp3_idct_mmx(input_data, dequant_matrix, transformed_data); | |
632 | |
633 /* place in final output */ | |
634 op = transformed_data; | |
635 for (i = 0; i < 8; i++) { | |
636 for (j = 0; j < 8; j++) { | |
637 sample = *dest + *op; | |
638 if (sample < 0) | |
639 *dest = 0; | |
640 else if (sample > 255) | |
641 *dest = 255; | |
642 else | |
643 *dest = (uint8_t)(sample & 0xFF); | |
644 op++; | |
645 dest++; | |
646 } | |
647 dest += (stride - 8); | |
648 } | |
649 | |
650 /* be a good MMX citizen */ | |
651 emms(); | |
652 } |