comparison libmpeg2/motion_comp_mmx.c @ 1:3b5f5d1c5041

Initial revision
author arpi_esp
date Sat, 24 Feb 2001 20:28:24 +0000
parents
children 846535ace7a2
comparison
equal deleted inserted replaced
0:c1bb2c071d63 1:3b5f5d1c5041
1 /*
2 * motion_comp_mmx.c
3 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
4 *
5 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
6 *
7 * mpeg2dec is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * mpeg2dec is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22 #include "config.h"
23
24 #ifdef ARCH_X86
25
26 #include <inttypes.h>
27
28 #include "mpeg2_internal.h"
29 #include "attributes.h"
30 #include "mmx.h"
31
32 #define CPU_MMXEXT 0
33 #define CPU_3DNOW 1
34
35
36 //MMX code - needs a rewrite
37
38
39
40
41
42
43
44 // some rounding constants
45 mmx_t round1 = {0x0001000100010001LL};
46 mmx_t round4 = {0x0002000200020002LL};
47
/*
 * This code should probably be compiled with loop unrolling
 * (i.e., -funroll-loops in gcc) because some of the loops
 * use a small static number of iterations. This was written
 * with the assumption the compiler knows best about when
 * unrolling will help
 */
55
56 static inline void mmx_zero_reg ()
57 {
58 // load 0 into mm0
59 pxor_r2r (mm0, mm0);
60 }
61
/*
 * Bytewise dest[0..7] = (src1[0..7] + src2[0..7] + 1) / 2.
 * Bytes are widened to 16-bit words so the +1 and shift cannot overflow.
 * Precondition: mm0 == 0 (mmx_zero_reg).  Clobbers mm1-mm4.
 */
static inline void mmx_average_2_U8 (uint8_t * dest,
                                     uint8_t * src1, uint8_t * src2)
{
    //
    // *dest = (*src1 + *src2 + 1)/ 2;
    //

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows to mm1
    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           // /2

    paddw_r2r (mm4, mm2);         // add highs to mm2
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
92
/*
 * Bytewise dest[0..7] = (dest + (src1 + src2 + 1)/2 + 1) / 2 --
 * half-pel interpolation of the two sources, then averaged with the
 * existing destination (the "avg" prediction case).
 * Precondition: mm0 == 0 (mmx_zero_reg).  Clobbers mm1-mm6.
 */
static inline void mmx_interp_average_2_U8 (uint8_t * dest,
                                            uint8_t * src1, uint8_t * src2)
{
    //
    // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
    //

    movq_m2r (*dest, mm1);        // load 8 dest bytes
    movq_r2r (mm1, mm2);          // copy 8 dest bytes

    movq_m2r (*src1, mm3);        // load 8 src1 bytes
    movq_r2r (mm3, mm4);          // copy 8 src1 bytes

    movq_m2r (*src2, mm5);        // load 8 src2 bytes
    movq_r2r (mm5, mm6);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low dest bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high dest bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src1 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src2 bytes

    paddw_r2r (mm5, mm3);         // add lows
    paddw_m2r (round1, mm3);
    psraw_i2r (1, mm3);           // /2

    paddw_r2r (mm6, mm4);         // add highs
    paddw_m2r (round1, mm4);
    psraw_i2r (1, mm4);           // /2

    paddw_r2r (mm3, mm1);         // add lows
    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           // /2

    paddw_r2r (mm4, mm2);         // add highs
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
137
/*
 * Bytewise dest[0..7] = (src1 + src2 + src3 + src4 + 2) / 4 --
 * the exact rounded four-way average used for x+y half-pel "put".
 * Sums are accumulated in 16-bit words, so no overflow is possible
 * (max 4*255+2 = 1022).
 * Precondition: mm0 == 0 (mmx_zero_reg).  Clobbers mm1-mm6.
 */
static inline void mmx_average_4_U8 (uint8_t * dest,
                                     uint8_t * src1, uint8_t * src2,
                                     uint8_t * src3, uint8_t * src4)
{
    //
    // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
    //

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r (*src3, mm3);        // load 8 src3 bytes
    movq_r2r (mm3, mm4);          // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src3 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    movq_m2r (*src4, mm5);        // load 8 src4 bytes
    movq_r2r (mm5, mm6);          // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src4 bytes

    paddw_r2r (mm5, mm1);         // add lows
    paddw_r2r (mm6, mm2);         // add highs

    // now have subtotal in mm1 and mm2

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           // /4
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           // /4

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
191
/*
 * Bytewise dest[0..7] = (dest + (src1+src2+src3+src4+2)/4 + 1) / 2 --
 * four-way half-pel interpolation averaged with the existing
 * destination (the x+y half-pel "avg" case).
 * Precondition: mm0 == 0 (mmx_zero_reg).  Clobbers mm1-mm6.
 */
static inline void mmx_interp_average_4_U8 (uint8_t * dest,
                                            uint8_t * src1, uint8_t * src2,
                                            uint8_t * src3, uint8_t * src4)
{
    //
    // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;
    //

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r (*src3, mm3);        // load 8 src3 bytes
    movq_r2r (mm3, mm4);          // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src3 bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    movq_m2r (*src4, mm5);        // load 8 src4 bytes
    movq_r2r (mm5, mm6);          // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);     // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);     // unpack high src4 bytes

    paddw_r2r (mm5, mm1);         // add lows
    paddw_r2r (mm6, mm2);         // add highs

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           // /4
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           // /4

    // now have subtotal/4 in mm1 and mm2

    movq_m2r (*dest, mm3);        // load 8 dest bytes
    movq_r2r (mm3, mm4);          // copy 8 dest bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low dest bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high dest bytes

    paddw_r2r (mm3, mm1);         // add lows
    paddw_r2r (mm4, mm2);         // add highs

    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);           // /2
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    // now have end value in mm1 and mm2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1,*dest);         // store result in dest
}
261
262 //-----------------------------------------------------------------------
263
/*
 * Full-pel "avg" prediction: dest = avg(dest, ref), row by row.
 * width is 8 or 16 -- the wrappers below pass a constant so the
 * inlined `if (width == 16)` folds away.  height > 0 is assumed
 * (do/while).  stride applies to both dest and ref.
 */
static inline void MC_avg_mmx (int width, int height,
                               uint8_t * dest, uint8_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, dest, ref);

        if (width == 16)
            mmx_average_2_U8 (dest+8, dest+8, ref+8);

        dest += stride;
        ref += stride;
    } while (--height);
}
279
/* Fixed-width entry points for the full-pel avg case (16 and 8 wide). */
static void MC_avg_16_mmx (uint8_t * dest, uint8_t * ref,
                           int stride, int height)
{
    MC_avg_mmx (16, height, dest, ref, stride);
}

static void MC_avg_8_mmx (uint8_t * dest, uint8_t * ref,
                          int stride, int height)
{
    MC_avg_mmx (8, height, dest, ref, stride);
}
291
292 //-----------------------------------------------------------------------
293
/*
 * Full-pel "put" prediction: straight 8- or 16-byte-per-row copy of
 * ref into dest.  Note the copy itself only uses mm1; the
 * mmx_zero_reg () call is not needed by this path but is harmless.
 */
static inline void MC_put_mmx (int width, int height,
                               uint8_t * dest, uint8_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        movq_m2r (* ref, mm1);        // load 8 ref bytes
        movq_r2m (mm1,* dest);        // store 8 bytes at curr

        if (width == 16)
        {
            movq_m2r (* (ref+8), mm1);        // load 8 ref bytes
            movq_r2m (mm1,* (dest+8));        // store 8 bytes at curr
        }

        dest += stride;
        ref += stride;
    } while (--height);
}
313
/* Fixed-width entry points for the full-pel put case (16 and 8 wide). */
static void MC_put_16_mmx (uint8_t * dest, uint8_t * ref,
                           int stride, int height)
{
    MC_put_mmx (16, height, dest, ref, stride);
}

static void MC_put_8_mmx (uint8_t * dest, uint8_t * ref,
                          int stride, int height)
{
    MC_put_mmx (8, height, dest, ref, stride);
}
325
326 //-----------------------------------------------------------------------
327
// Half pixel interpolation in the x direction, averaged with dest:
// dest = avg(dest, (ref + ref[+1] + 1)/2) per byte, row by row.
static inline void MC_avg_x_mmx (int width, int height,
                                 uint8_t * dest, uint8_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}
344
/* Fixed-width entry points for the x half-pel avg case. */
static void MC_avg_x16_mmx (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_avg_x_mmx (16, height, dest, ref, stride);
}

static void MC_avg_x8_mmx (uint8_t * dest, uint8_t * ref,
                           int stride, int height)
{
    MC_avg_x_mmx (8, height, dest, ref, stride);
}
356
357 //-----------------------------------------------------------------------
358
/*
 * Half-pel interpolation in x, "put" variant:
 * dest = (ref + ref[+1] + 1)/2 per byte, row by row.
 */
static inline void MC_put_x_mmx (int width, int height,
                                 uint8_t * dest, uint8_t * ref, int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}
374
/* Fixed-width entry points for the x half-pel put case. */
static void MC_put_x16_mmx (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_put_x_mmx (16, height, dest, ref, stride);
}

static void MC_put_x8_mmx (uint8_t * dest, uint8_t * ref,
                           int stride, int height)
{
    MC_put_x_mmx (8, height, dest, ref, stride);
}
386
387 //-----------------------------------------------------------------------
388
/*
 * Half-pel interpolation in both x and y, averaged with dest:
 * the four-way average of (ref, ref+1, ref+stride, ref+stride+1)
 * is combined with dest via mmx_interp_average_4_U8.
 * ref_next tracks the row below ref throughout the loop.
 */
static inline void MC_avg_xy_mmx (int width, int height,
                                  uint8_t * dest, uint8_t * ref, int stride)
{
    uint8_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

        if (width == 16)
            mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
                                     ref_next+8, ref_next+9);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}
408
/* Fixed-width entry points for the x+y half-pel avg case. */
static void MC_avg_xy16_mmx (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_avg_xy_mmx (16, height, dest, ref, stride);
}

static void MC_avg_xy8_mmx (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_avg_xy_mmx (8, height, dest, ref, stride);
}
420
421 //-----------------------------------------------------------------------
422
/*
 * Half-pel interpolation in both x and y, "put" variant:
 * dest = (ref + ref[+1] + ref[+stride] + ref[+stride+1] + 2)/4.
 */
static inline void MC_put_xy_mmx (int width, int height,
                                  uint8_t * dest, uint8_t * ref, int stride)
{
    uint8_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

        if (width == 16)
            mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}
441
/* Fixed-width entry points for the x+y half-pel put case. */
static void MC_put_xy16_mmx (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_put_xy_mmx (16, height, dest, ref, stride);
}

static void MC_put_xy8_mmx (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_put_xy_mmx (8, height, dest, ref, stride);
}
453
454 //-----------------------------------------------------------------------
455
/*
 * Half-pel interpolation in the y direction, averaged with dest:
 * dest = avg(dest, (ref + ref[+stride] + 1)/2) per byte.
 */
static inline void MC_avg_y_mmx (int width, int height,
                                 uint8_t * dest, uint8_t * ref, int stride)
{
    uint8_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}
474
/* Fixed-width entry points for the y half-pel avg case. */
static void MC_avg_y16_mmx (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_avg_y_mmx (16, height, dest, ref, stride);
}

static void MC_avg_y8_mmx (uint8_t * dest, uint8_t * ref,
                           int stride, int height)
{
    MC_avg_y_mmx (8, height, dest, ref, stride);
}
486
487 //-----------------------------------------------------------------------
488
/*
 * Half-pel interpolation in the y direction, "put" variant:
 * dest = (ref + ref[+stride] + 1)/2 per byte.
 */
static inline void MC_put_y_mmx (int width, int height,
                                 uint8_t * dest, uint8_t * ref, int stride)
{
    uint8_t * ref_next = ref+stride;

    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}
507
/* Fixed-width entry points for the y half-pel put case. */
static void MC_put_y16_mmx (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_put_y_mmx (16, height, dest, ref, stride);
}

static void MC_put_y8_mmx (uint8_t * dest, uint8_t * ref,
                           int stride, int height)
{
    MC_put_y_mmx (8, height, dest, ref, stride);
}


// Emits the plain-MMX motion-compensation function table from the
// MC_*_mmx entry points above (macro presumably defined in
// mpeg2_internal.h -- TODO confirm).
MOTION_COMP_EXTERN (mmx)
522
523
524
525
526
527
528
//CPU_MMXEXT/CPU_3DNOW adaptation layer

/*
 * pavg_r2r / pavg_m2r select the packed-byte-average instruction:
 * pavgb (MMXEXT/SSE) vs pavgusb (3DNow!); both compute (a+b+1)>>1
 * per byte.  NOTE: the macros read a variable named `cpu' from the
 * enclosing function's scope.  Every caller below receives `cpu' as
 * a compile-time constant, so the branch folds away when inlined.
 */
#define pavg_r2r(src,dest) \
do { \
    if (cpu == CPU_MMXEXT) \
        pavgb_r2r (src, dest); \
    else \
        pavgusb_r2r (src, dest); \
} while (0)

#define pavg_m2r(src,dest) \
do { \
    if (cpu == CPU_MMXEXT) \
        pavgb_m2r (src, dest); \
    else \
        pavgusb_m2r (src, dest); \
} while (0)
546
547
548 //CPU_MMXEXT code
549
550
/* 8-wide full-pel put: copy one quadword per row.  No `cpu'
 * parameter needed -- plain MMX moves suffice. */
static inline void MC_put1_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride)
{
    do {
        movq_m2r (*ref, mm0);
        movq_r2m (mm0, *dest);
        ref += stride;
        dest += stride;
    } while (--height);
}
561
/* 16-wide full-pel put: copy two quadwords per row.  Both loads are
 * issued before the stores to overlap the memory accesses. */
static inline void MC_put1_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        ref += stride;
        movq_r2m (mm0, *dest);
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}
574
/* 8-wide full-pel avg: dest = pavg(dest, ref) per row.  `cpu' is only
 * read (implicitly) by the pavg_m2r macro to pick pavgb vs pavgusb. */
static inline void MC_avg1_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*dest, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}
586
/* 16-wide full-pel avg: dest = pavg(dest, ref), two quadwords per
 * row.  `cpu' is consumed by the pavg_m2r macro. */
static inline void MC_avg1_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        pavg_m2r (*dest, mm0);
        pavg_m2r (*(dest+8), mm1);
        movq_r2m (mm0, *dest);
        ref += stride;
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}
601
/* 8-wide put with one half-pel direction: dest = pavg(ref, ref+offset).
 * offset == 1 gives x interpolation, offset == stride gives y
 * (see the wrappers below).  `cpu' is consumed by pavg_m2r. */
static inline void MC_put2_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int offset, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*(ref+offset), mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}
613
/* 16-wide put with one half-pel direction (offset = 1 or stride),
 * two quadwords per row.  `cpu' is consumed by pavg_m2r. */
static inline void MC_put2_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride, int offset, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*(ref+offset+8), mm1);
        movq_r2m (mm0, *dest);
        ref += stride;
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}
628
/* 8-wide avg with one half-pel direction:
 * dest = pavg(dest, pavg(ref, ref+offset)).  Note the chained pavg
 * rounds each stage up, matching the interp-average MMX path. */
static inline void MC_avg2_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int offset, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*dest, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}
641
/* 16-wide avg with one half-pel direction:
 * dest = pavg(dest, pavg(ref, ref+offset)), two quadwords per row. */
static inline void MC_avg2_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride, int offset, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*(ref+offset+8), mm1);
        pavg_m2r (*dest, mm0);
        pavg_m2r (*(dest+8), mm1);
        ref += stride;
        movq_r2m (mm0, *dest);
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}
658
659 static mmx_t mask_one = {0x0101010101010101LL};
660
/*
 * 8-wide put with half-pel interpolation in both x and y.
 * Each row's horizontal average pavg(ref, ref+1) is averaged with the
 * previous row's; the xor/or/and chain with mask_one then subtracts a
 * low-bit correction so the chained pavg rounding matches the exact
 * four-way average -- NOTE(review): this is the classic pavg
 * rounding-correction trick; confirm the exact rounding against the
 * plain-MMX mmx_average_4_U8 path.
 * The previous row's average and xor bits are carried in mm0/mm6
 * across iterations ("unroll" comments) so each source row is loaded
 * only once.  `cpu' is consumed by the pavg_r2r macro.
 */
static inline void MC_put4_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int cpu)
{
    // prime the pipeline with the first row's horizontal average
    movq_m2r (*ref, mm0);
    movq_m2r (*(ref+1), mm1);
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);
    pavg_r2r (mm1, mm0);
    ref += stride;

    do {
        movq_m2r (*ref, mm2);
        movq_r2r (mm0, mm5);

        movq_m2r (*(ref+1), mm3);
        movq_r2r (mm2, mm6);

        pxor_r2r (mm3, mm6);
        pavg_r2r (mm3, mm2);

        por_r2r (mm6, mm7);
        pxor_r2r (mm2, mm5);

        pand_r2r (mm5, mm7);
        pavg_r2r (mm2, mm0);

        pand_m2r (mask_one, mm7);

        psubusb_r2r (mm7, mm0);

        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;

        movq_r2r (mm6, mm7);      // unroll !
        movq_r2r (mm2, mm0);      // unroll !
    } while (--height);
}
699
/*
 * 16-wide put with half-pel interpolation in both x and y.
 * Unlike MC_put4_8 this does not carry state between rows: each
 * iteration loads both rows and handles the low and high quadwords
 * as two independent copies of the same pavg + mask_one
 * rounding-correction sequence.  `cpu' is consumed by pavg_r2r.
 */
static inline void MC_put4_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride, int cpu)
{
    do {
        // low quadword (bytes 0-7)
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_r2m (mm0, *dest);

        // high quadword (bytes 8-15)
        movq_m2r (*(ref+8), mm0);
        movq_m2r (*(ref+stride+9), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+9), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride+8), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        ref += stride;
        movq_r2m (mm0, *(dest+8));
        dest += stride;
    } while (--height);
}
745
/*
 * 8-wide avg with half-pel interpolation in both x and y:
 * same rounding-corrected four-way average as MC_put4_16's per-row
 * sequence, then a final pavg with the existing dest bytes.
 * `cpu' is consumed by the pavg_r2r macro.
 */
static inline void MC_avg4_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*dest, mm1);
        pavg_r2r (mm1, mm0);      // blend with existing dest
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}
774
/*
 * 16-wide avg with half-pel interpolation in both x and y:
 * two independent copies (low/high quadword) of the MC_avg4_8 row
 * sequence.  `cpu' is consumed by the pavg_r2r macro.
 */
static inline void MC_avg4_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride, int cpu)
{
    do {
        // low quadword (bytes 0-7)
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*dest, mm1);
        pavg_r2r (mm1, mm0);      // blend with existing dest
        movq_r2m (mm0, *dest);

        // high quadword (bytes 8-15)
        movq_m2r (*(ref+8), mm0);
        movq_m2r (*(ref+stride+9), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+9), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride+8), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*(dest+8), mm1);
        pavg_r2r (mm1, mm0);      // blend with existing dest
        ref += stride;
        movq_r2m (mm0, *(dest+8));
        dest += stride;
    } while (--height);
}
824
/*
 * MMXEXT entry points: bind the generic MC helpers above to
 * CPU_MMXEXT (pavgb) and the fixed widths 16/8.  Suffix key:
 * no suffix = full pel, x/y = half pel in that direction,
 * xy = half pel in both.
 */
static void MC_avg_16_mmxext (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_avg1_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_avg_8_mmxext (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_avg1_8 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_16_mmxext (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}

static void MC_put_8_mmxext (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}

static void MC_avg_x16_mmxext (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_avg_x8_mmxext (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_put_x16_mmxext (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_put_x8_mmxext (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_avg_y16_mmxext (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_avg_y8_mmxext (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_put_y16_mmxext (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_put_y8_mmxext (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_avg_xy16_mmxext (uint8_t * dest, uint8_t * ref,
                                int stride, int height)
{
    MC_avg4_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_avg_xy8_mmxext (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_avg4_8 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_xy16_mmxext (uint8_t * dest, uint8_t * ref,
                                int stride, int height)
{
    MC_put4_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_xy8_mmxext (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_put4_8 (height, dest, ref, stride, CPU_MMXEXT);
}


// Emits the MMXEXT motion-compensation function table (macro
// presumably defined in mpeg2_internal.h -- TODO confirm).
MOTION_COMP_EXTERN (mmxext)
923
924
925
/*
 * 3DNow! entry points: identical wiring to the MMXEXT table above,
 * but with CPU_3DNOW so the pavg macros emit pavgusb instead of
 * pavgb.  The put_16/put_8 paths use plain MMX moves and need no
 * cpu selector.
 */
static void MC_avg_16_3dnow (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_avg1_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_avg_8_3dnow (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_avg1_8 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_16_3dnow (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}

static void MC_put_8_3dnow (uint8_t * dest, uint8_t * ref,
                            int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}

static void MC_avg_x16_3dnow (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_avg_x8_3dnow (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_put_x16_3dnow (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_put_x8_3dnow (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_avg_y16_3dnow (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_avg_y8_3dnow (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_put_y16_3dnow (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_put_y8_3dnow (uint8_t * dest, uint8_t * ref,
                             int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_avg_xy16_3dnow (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_avg4_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_avg_xy8_3dnow (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_avg4_8 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_xy16_3dnow (uint8_t * dest, uint8_t * ref,
                               int stride, int height)
{
    MC_put4_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_xy8_3dnow (uint8_t * dest, uint8_t * ref,
                              int stride, int height)
{
    MC_put4_8 (height, dest, ref, stride, CPU_3DNOW);
}


// Emits the 3DNow! motion-compensation function table (macro
// presumably defined in mpeg2_internal.h -- TODO confirm).
MOTION_COMP_EXTERN (3dnow)
1024
1025 #endif