Provenance: mplayer.hg Mercurial repository, file libmpeg2/motion_comp_mmx.c,
changeset 1:3b5f5d1c5041 ("Initial revision"), author arpi_esp,
date Sat, 24 Feb 2001 20:28:24 +0000; parent 0:c1bb2c071d63,
child changeset 846535ace7a2.
1 /* | |
2 * motion_comp_mmx.c | |
3 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> | |
4 * | |
5 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. | |
6 * | |
7 * mpeg2dec is free software; you can redistribute it and/or modify | |
8 * it under the terms of the GNU General Public License as published by | |
9 * the Free Software Foundation; either version 2 of the License, or | |
10 * (at your option) any later version. | |
11 * | |
12 * mpeg2dec is distributed in the hope that it will be useful, | |
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 * GNU General Public License for more details. | |
16 * | |
17 * You should have received a copy of the GNU General Public License | |
18 * along with this program; if not, write to the Free Software | |
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
20 */ | |
21 | |
22 #include "config.h" | |
23 | |
24 #ifdef ARCH_X86 | |
25 | |
26 #include <inttypes.h> | |
27 | |
28 #include "mpeg2_internal.h" | |
29 #include "attributes.h" | |
30 #include "mmx.h" | |
31 | |
32 #define CPU_MMXEXT 0 | |
33 #define CPU_3DNOW 1 | |
34 | |
35 | |
36 //MMX code - needs a rewrite | |
37 | |
38 | |
39 | |
40 | |
41 | |
42 | |
43 | |
44 // some rounding constants | |
45 mmx_t round1 = {0x0001000100010001LL}; | |
46 mmx_t round4 = {0x0002000200020002LL}; | |
47 | |
/*
 * This code should probably be compiled with loop unrolling
 * (ie, -funroll-loops in gcc) because some of the loops
 * use a small static number of iterations. This was written
 * with the assumption the compiler knows best about when
 * unrolling will help.
 */
55 | |
56 static inline void mmx_zero_reg () | |
57 { | |
58 // load 0 into mm0 | |
59 pxor_r2r (mm0, mm0); | |
60 } | |
61 | |
/* 8-byte rounded average: dest[i] = (src1[i] + src2[i] + 1) / 2 for
 * i = 0..7.  Precondition: mm0 == 0 (set by mmx_zero_reg in the
 * caller) -- it is the zero source for the byte->word unpacks.
 * Clobbers mm1-mm4. */
static inline void mmx_average_2_U8 (uint8_t * dest,
                                     uint8_t * src1, uint8_t * src2)
{
    //
    // *dest = (*src1 + *src2 + 1)/ 2;
    //

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows to mm1
    paddw_m2r (round1, mm1);      // +1 so the shift rounds half up
    psraw_i2r (1, mm1);           // /2

    paddw_r2r (mm4, mm2);         // add highs to mm2
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
92 | |
/* 8-byte average of two sources, then averaged again with the existing
 * dest bytes (both stages round half up):
 *   dest[i] = (dest[i] + (src1[i] + src2[i] + 1)/2 + 1) / 2
 * Precondition: mm0 == 0 (mmx_zero_reg).  Clobbers mm1-mm6. */
static inline void mmx_interp_average_2_U8 (uint8_t * dest,
                                            uint8_t * src1, uint8_t * src2)
{
    //
    // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
    //

    movq_m2r (*dest, mm1);        // load 8 dest bytes
    movq_r2r (mm1, mm2);          // copy 8 dest bytes

    movq_m2r (*src1, mm3);        // load 8 src1 bytes
    movq_r2r (mm3, mm4);          // copy 8 src1 bytes

    movq_m2r (*src2, mm5);        // load 8 src2 bytes
    movq_r2r (mm5, mm6);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low dest bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high dest bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src1 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src2 bytes

    paddw_r2r (mm5, mm3);         // add lows
    paddw_m2r (round1, mm3);
    psraw_i2r (1, mm3);           // /2

    paddw_r2r (mm6, mm4);         // add highs
    paddw_m2r (round1, mm4);
    psraw_i2r (1, mm4);           // /2

    paddw_r2r (mm3, mm1);         // add lows
    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           // /2

    paddw_r2r (mm4, mm2);         // add highs
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
137 | |
/* 8-byte rounded average of four sources:
 *   dest[i] = (src1[i] + src2[i] + src3[i] + src4[i] + 2) / 4
 * Sums are accumulated as 16-bit words (max 4*255+2, no overflow).
 * Precondition: mm0 == 0 (mmx_zero_reg).  Clobbers mm1-mm6. */
static inline void mmx_average_4_U8 (uint8_t * dest,
                                     uint8_t * src1, uint8_t * src2,
                                     uint8_t * src3, uint8_t * src4)
{
    //
    // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
    //

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r (*src3, mm3);        // load 8 src3 bytes
    movq_r2r (mm3, mm4);          // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src3 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    movq_m2r (*src4, mm5);        // load 8 src4 bytes
    movq_r2r (mm5, mm6);          // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src4 bytes

    paddw_r2r (mm5, mm1);         // add lows
    paddw_r2r (mm6, mm2);         // add highs

    // now have subtotal in mm1 and mm2

    paddw_m2r (round4, mm1);      // +2 so the shift rounds half up
    psraw_i2r (2, mm1);           // /4
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           // /4

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
191 | |
/* Four-source rounded average, then averaged again with existing dest:
 *   dest[i] = (dest[i] + (src1[i]+src2[i]+src3[i]+src4[i]+2)/4 + 1) / 2
 * Precondition: mm0 == 0 (mmx_zero_reg).  Clobbers mm1-mm6. */
static inline void mmx_interp_average_4_U8 (uint8_t * dest,
                                            uint8_t * src1, uint8_t * src2,
                                            uint8_t * src3, uint8_t * src4)
{
    //
    // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;
    //

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r (*src3, mm3);        // load 8 src3 bytes
    movq_r2r (mm3, mm4);          // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src3 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    movq_m2r (*src4, mm5);        // load 8 src4 bytes
    movq_r2r (mm5, mm6);          // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src4 bytes

    paddw_r2r (mm5, mm1);         // add lows
    paddw_r2r (mm6, mm2);         // add highs

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           // /4
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           // /4

    // now have subtotal/4 in mm1 and mm2

    movq_m2r (*dest, mm3);        // load 8 dest bytes
    movq_r2r (mm3, mm4);          // copy 8 dest bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low dest bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high dest bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           // /2
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    // now have end value in mm1 and mm2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1,*dest);         // store result in dest
}
261 | |
262 //----------------------------------------------------------------------- | |
263 | |
/* Full-pel "avg" motion compensation: for each of `height' rows,
 * average the existing dest pixels with the reference pixels in place.
 * width is 8 or 16 (only the == 16 case widens the per-row work). */
static inline void MC_avg_mmx (int width, int height,
                               uint8_t * dest, uint8_t * ref, int stride)
{
    mmx_zero_reg ();              // helpers below require mm0 == 0

    do {
        mmx_average_2_U8 (dest, dest, ref);

        if (width == 16)
            mmx_average_2_U8 (dest+8, dest+8, ref+8);

        dest += stride;
        ref += stride;
    } while (--height);
}
279 | |
/* Dispatch entries: full-pel avg, 16- and 8-pixel-wide blocks. */
static void MC_avg_16_mmx (uint8_t * dest, uint8_t * ref,
                           int stride, int height)
{
    MC_avg_mmx (16, height, dest, ref, stride);
}

static void MC_avg_8_mmx (uint8_t * dest, uint8_t * ref,
                          int stride, int height)
{
    MC_avg_mmx (8, height, dest, ref, stride);
}
291 | |
292 //----------------------------------------------------------------------- | |
293 | |
/* Full-pel "put" motion compensation: straight 8- or 16-byte-wide row
 * copy from ref to dest for `height' rows. */
static inline void MC_put_mmx (int width, int height,
                               uint8_t * dest, uint8_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        movq_m2r (* ref, mm1);    // load 8 ref bytes
        movq_r2m (mm1,* dest);    // store 8 bytes at curr

        if (width == 16)
        {
            movq_m2r (* (ref+8), mm1);    // load 8 ref bytes
            movq_r2m (mm1,* (dest+8));    // store 8 bytes at curr
        }

        dest += stride;
        ref += stride;
    } while (--height);
}
313 | |
/* Dispatch entries: full-pel put, 16- and 8-pixel-wide blocks. */
static void MC_put_16_mmx (uint8_t * dest, uint8_t * ref,
                           int stride, int height)
{
    MC_put_mmx (16, height, dest, ref, stride);
}

static void MC_put_8_mmx (uint8_t * dest, uint8_t * ref,
                          int stride, int height)
{
    MC_put_mmx (8, height, dest, ref, stride);
}
325 | |
326 //----------------------------------------------------------------------- | |
327 | |
// Half pixel interpolation in the x direction
/* avg variant: dest = avg(dest, avg(ref, ref+1)) for each row. */
static inline void MC_avg_x_mmx (int width, int height,
                                 uint8_t * dest, uint8_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}
344 | |
/* Dispatch entries: horizontal-half-pel avg, 16- and 8-wide. */
static void MC_avg_x16_mmx (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_avg_x_mmx (16, height, dest, ref, stride);
}

static void MC_avg_x8_mmx (uint8_t * dest, uint8_t * ref,
                           int stride, int height)
{
    MC_avg_x_mmx (8, height, dest, ref, stride);
}
356 | |
357 //----------------------------------------------------------------------- | |
358 | |
/* Half-pel x put: dest = avg(ref, ref+1) for each row. */
static inline void MC_put_x_mmx (int width, int height,
                                 uint8_t * dest, uint8_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}
374 | |
/* Dispatch entries: horizontal-half-pel put, 16- and 8-wide. */
static void MC_put_x16_mmx (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_put_x_mmx (16, height, dest, ref, stride);
}

static void MC_put_x8_mmx (uint8_t * dest, uint8_t * ref,
                           int stride, int height)
{
    MC_put_x_mmx (8, height, dest, ref, stride);
}
386 | |
387 //----------------------------------------------------------------------- | |
388 | |
/* Half-pel x+y avg: average dest with the 4-tap interpolation of
 * ref, ref+1 and the same pair one row down. */
static inline void MC_avg_xy_mmx (int width, int height,
                                  uint8_t * dest, uint8_t * ref, int stride)
{
    uint8_t * ref_next = ref+stride;    // row below the current one

    mmx_zero_reg ();

    do {
        mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

        if (width == 16)
            mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
                                     ref_next+8, ref_next+9);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}
408 | |
/* Dispatch entries: diagonal-half-pel avg, 16- and 8-wide. */
static void MC_avg_xy16_mmx (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_avg_xy_mmx (16, height, dest, ref, stride);
}

static void MC_avg_xy8_mmx (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_avg_xy_mmx (8, height, dest, ref, stride);
}
420 | |
421 //----------------------------------------------------------------------- | |
422 | |
/* Half-pel x+y put: dest = rounded average of the 2x2 neighborhood
 * ref, ref+1, ref+stride, ref+stride+1. */
static inline void MC_put_xy_mmx (int width, int height,
                                  uint8_t * dest, uint8_t * ref, int stride)
{
    uint8_t * ref_next = ref+stride;    // row below the current one

    mmx_zero_reg ();

    do {
        mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

        if (width == 16)
            mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}
441 | |
/* Dispatch entries: diagonal-half-pel put, 16- and 8-wide. */
static void MC_put_xy16_mmx (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_put_xy_mmx (16, height, dest, ref, stride);
}

static void MC_put_xy8_mmx (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_put_xy_mmx (8, height, dest, ref, stride);
}
453 | |
454 //----------------------------------------------------------------------- | |
455 | |
/* Half-pel y avg: dest = avg(dest, avg(ref, ref+stride)) per row. */
static inline void MC_avg_y_mmx (int width, int height,
                                 uint8_t * dest, uint8_t * ref, int stride)
{
    uint8_t * ref_next = ref+stride;    // row below the current one

    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}
474 | |
/* Dispatch entries: vertical-half-pel avg, 16- and 8-wide. */
static void MC_avg_y16_mmx (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_avg_y_mmx (16, height, dest, ref, stride);
}

static void MC_avg_y8_mmx (uint8_t * dest, uint8_t * ref,
                           int stride, int height)
{
    MC_avg_y_mmx (8, height, dest, ref, stride);
}
486 | |
487 //----------------------------------------------------------------------- | |
488 | |
/* Half-pel y put: dest = avg(ref, ref+stride) per row. */
static inline void MC_put_y_mmx (int width, int height,
                                 uint8_t * dest, uint8_t * ref, int stride)
{
    uint8_t * ref_next = ref+stride;    // row below the current one

    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}
507 | |
/* Dispatch entries: vertical-half-pel put, 16- and 8-wide. */
static void MC_put_y16_mmx (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_put_y_mmx (16, height, dest, ref, stride);
}

static void MC_put_y8_mmx (uint8_t * dest, uint8_t * ref,
                           int stride, int height)
{
    MC_put_y_mmx (8, height, dest, ref, stride);
}


// Emits the motion-compensation function-pointer table for the plain
// MMX versions above (macro presumably from mpeg2_internal.h -- verify).
MOTION_COMP_EXTERN (mmx)
522 | |
523 | |
524 | |
525 | |
526 | |
527 | |
528 | |
//CPU_MMXEXT/CPU_3DNOW adaptation layer
//
// The MMXEXT pavgb and the 3DNow! pavgusb opcodes both compute a
// rounded byte average; these wrappers select between them using the
// `cpu' variable that every enclosing function takes as a parameter.
// Since all callers pass a compile-time constant and are inline, the
// compiler can fold the branch away.

#define pavg_r2r(src,dest) \
do { \
    if (cpu == CPU_MMXEXT) \
        pavgb_r2r (src, dest); \
    else \
        pavgusb_r2r (src, dest); \
} while (0)

#define pavg_m2r(src,dest) \
do { \
    if (cpu == CPU_MMXEXT) \
        pavgb_m2r (src, dest); \
    else \
        pavgusb_m2r (src, dest); \
} while (0)
546 | |
547 | |
548 //CPU_MMXEXT code | |
549 | |
550 | |
/* Full-pel put, 8 bytes per row: plain row copy ref -> dest. */
static inline void MC_put1_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride)
{
    do {
        movq_m2r (*ref, mm0);
        movq_r2m (mm0, *dest);
        ref += stride;
        dest += stride;
    } while (--height);
}
561 | |
/* Full-pel put, 16 bytes per row (two movq per row). */
static inline void MC_put1_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        ref += stride;
        movq_r2m (mm0, *dest);
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}
574 | |
/* Full-pel avg, 8 wide: dest = pavg(dest, ref) per row.
 * `cpu' is consumed by the pavg_m2r macro (MMXEXT vs 3DNow!). */
static inline void MC_avg1_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*dest, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}
586 | |
/* Full-pel avg, 16 wide: dest = pavg(dest, ref) per row. */
static inline void MC_avg1_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        pavg_m2r (*dest, mm0);
        pavg_m2r (*(dest+8), mm1);
        movq_r2m (mm0, *dest);
        ref += stride;
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}
601 | |
/* Single-axis half-pel put, 8 wide: dest = pavg(ref, ref+offset).
 * offset is 1 for horizontal, stride for vertical interpolation. */
static inline void MC_put2_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int offset, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*(ref+offset), mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}
613 | |
/* Single-axis half-pel put, 16 wide (offset = 1 or stride). */
static inline void MC_put2_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride, int offset, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*(ref+offset+8), mm1);
        movq_r2m (mm0, *dest);
        ref += stride;
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}
628 | |
/* Single-axis half-pel avg, 8 wide:
 * dest = pavg(dest, pavg(ref, ref+offset)). */
static inline void MC_avg2_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int offset, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*dest, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}
641 | |
/* Single-axis half-pel avg, 16 wide (offset = 1 or stride). */
static inline void MC_avg2_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride, int offset, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*(ref+offset+8), mm1);
        pavg_m2r (*dest, mm0);
        pavg_m2r (*(dest+8), mm1);
        ref += stride;
        movq_r2m (mm0, *dest);
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}
658 | |
// 0x01 in every byte lane: mask for the low-bit rounding correction
// used by the MC_{put,avg}4_* kernels below.
static mmx_t mask_one = {0x0101010101010101LL};
660 | |
/* Diagonal half-pel put, 8 wide: dest = rounded average of the 2x2
 * neighborhood ref[0], ref[1], ref[stride], ref[stride+1].
 * pavg rounds up, so chaining two pavgs over-rounds; mm7 collects the
 * low bits dropped by each pavg (via xor of its operands) and psubusb
 * subtracts a 1-per-byte correction where needed.  Each row's
 * horizontal average is reused as input for the next iteration's
 * vertical average (the "unroll" moves at the bottom).
 * NOTE(review): correction algebra taken on trust from the original
 * mpeg2dec code -- verify against the reference C implementation. */
static inline void MC_put4_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int cpu)
{
    movq_m2r (*ref, mm0);
    movq_m2r (*(ref+1), mm1);
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);          // bits lost by the row-0 pavg
    pavg_r2r (mm1, mm0);          // horizontal average of row 0
    ref += stride;

    do {
        movq_m2r (*ref, mm2);
        movq_r2r (mm0, mm5);

        movq_m2r (*(ref+1), mm3);
        movq_r2r (mm2, mm6);

        pxor_r2r (mm3, mm6);      // bits lost averaging this row
        pavg_r2r (mm3, mm2);      // horizontal average of this row

        por_r2r (mm6, mm7);
        pxor_r2r (mm2, mm5);      // bits lost by the vertical average

        pand_r2r (mm5, mm7);
        pavg_r2r (mm2, mm0);      // vertical average of the two rows

        pand_m2r (mask_one, mm7); // keep only the +1 correction bits

        psubusb_r2r (mm7, mm0);   // undo the double round-up

        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;

        movq_r2r (mm6, mm7);      // unroll !
        movq_r2r (mm2, mm0);      // unroll !
    } while (--height);
}
699 | |
/* Diagonal half-pel put, 16 wide.  Same rounding-corrected double-pavg
 * scheme as MC_put4_8, but reloading both rows every iteration (no
 * cross-iteration reuse) and processing the two 8-byte halves
 * back to back. */
static inline void MC_put4_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride, int cpu)
{
    do {
        // left 8 bytes
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);      // diagonal pair average
        pavg_r2r (mm3, mm2);      // anti-diagonal pair average
        por_r2r (mm6, mm7);       // lost low bits of both pavgs
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7); // +1 correction mask
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);   // undo the double round-up
        movq_r2m (mm0, *dest);

        // right 8 bytes
        movq_m2r (*(ref+8), mm0);
        movq_m2r (*(ref+stride+9), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+9), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride+8), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        ref += stride;
        movq_r2m (mm0, *(dest+8));
        dest += stride;
    } while (--height);
}
745 | |
/* Diagonal half-pel avg, 8 wide: compute the rounding-corrected 2x2
 * average exactly as in MC_put4_16's per-half body, then pavg the
 * result with the existing dest bytes before storing. */
static inline void MC_avg4_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);   // corrected 2x2 average
        movq_m2r (*dest, mm1);
        pavg_r2r (mm1, mm0);      // blend with existing dest
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}
774 | |
/* Diagonal half-pel avg, 16 wide: MC_avg4_8's scheme applied to both
 * 8-byte halves of each row. */
static inline void MC_avg4_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride, int cpu)
{
    do {
        // left 8 bytes
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);   // corrected 2x2 average
        movq_m2r (*dest, mm1);
        pavg_r2r (mm1, mm0);      // blend with existing dest
        movq_r2m (mm0, *dest);

        // right 8 bytes
        movq_m2r (*(ref+8), mm0);
        movq_m2r (*(ref+stride+9), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+9), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride+8), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*(dest+8), mm1);
        pavg_r2r (mm1, mm0);
        ref += stride;
        movq_r2m (mm0, *(dest+8));
        dest += stride;
    } while (--height);
}
824 | |
/* Dispatch entries for the MMXEXT versions.  Naming scheme:
 * MC_{put,avg}_{,x,y,xy}{8,16}_mmxext, where x/y denote a half-pel
 * offset in that direction; implemented via the MC_*{1,2,4}_* kernels
 * above with CPU_MMXEXT selecting the pavgb opcode. */
static void MC_avg_16_mmxext (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_avg1_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_avg_8_mmxext (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_avg1_8 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_16_mmxext (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}

static void MC_put_8_mmxext (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}

static void MC_avg_x16_mmxext (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_avg_x8_mmxext (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_put_x16_mmxext (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_put_x8_mmxext (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_avg_y16_mmxext (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_avg_y8_mmxext (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_put_y16_mmxext (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_put_y8_mmxext (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_avg_xy16_mmxext (uint8_t * dest, uint8_t * ref,
                                int stride, int height)
{
    MC_avg4_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_avg_xy8_mmxext (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_avg4_8 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_xy16_mmxext (uint8_t * dest, uint8_t * ref,
                                int stride, int height)
{
    MC_put4_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_xy8_mmxext (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_put4_8 (height, dest, ref, stride, CPU_MMXEXT);
}


// Function-pointer table for the MMXEXT versions above.
MOTION_COMP_EXTERN (mmxext)
923 | |
924 | |
925 | |
/* Dispatch entries for the 3DNow! versions -- identical structure to
 * the mmxext set above, but CPU_3DNOW selects the pavgusb opcode in
 * the pavg_* adaptation macros. */
static void MC_avg_16_3dnow (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_avg1_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_avg_8_3dnow (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_avg1_8 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_16_3dnow (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}

static void MC_put_8_3dnow (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}

static void MC_avg_x16_3dnow (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_avg_x8_3dnow (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_put_x16_3dnow (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_put_x8_3dnow (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_avg_y16_3dnow (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_avg_y8_3dnow (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_put_y16_3dnow (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_put_y8_3dnow (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_avg_xy16_3dnow (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_avg4_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_avg_xy8_3dnow (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_avg4_8 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_xy16_3dnow (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_put4_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_xy8_3dnow (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_put4_8 (height, dest, ref, stride, CPU_3DNOW);
}


// Function-pointer table for the 3DNow! versions above.
MOTION_COMP_EXTERN (3dnow)
1024 | |
1025 #endif |