mplayer.hg: annotate libmpeg2/motion_comp_mmx.c @ 17023:dd5be8f8d16d

description: buffering in the muxer layer; patch by Corey Hickey (bugfood-ml ad fatooh punctum org) plus small fixes by me
author:      nicodvb
date:        Mon, 21 Nov 2005 22:53:14 +0000
parents:     1385ec491ffb
children:    f580a7755ac5
/*
 * motion_comp_mmx.c
 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 * See http://libmpeg2.sourceforge.net/ for updates.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Modified for use with MPlayer, see libmpeg-0.4.0.diff for the exact changes.
 * detailed CVS changelog at http://www.mplayerhq.hu/cgi-bin/cvsweb.cgi/main/
 * $Id$
 */

#include "config.h"

#if defined(ARCH_X86) || defined(ARCH_X86_64)

#include <inttypes.h>

#include "mpeg2.h"
#include "attributes.h"
#include "mpeg2_internal.h"
#include "mmx.h"

#define CPU_MMXEXT 0
#define CPU_3DNOW 1

/* MMX code - needs a rewrite */

/*
 * Motion Compensation frequently needs to average values using the
 * formula (x+y+1)>>1. Both MMXEXT and 3Dnow include one instruction
 * to compute this, but it's been left out of classic MMX.
 *
 * We need to be careful of overflows when doing this computation.
 * Rather than unpacking data to 16 bits, which reduces parallelism,
 * we use the following formulas:
 *
 * (x+y)>>1 == (x&y)+((x^y)>>1)
 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
 */
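
/*
 * A minimal scalar sketch of the same identities (illustration only, the
 * hypothetical helpers below are not used by the MMX code in this file):
 * with x = 200 and y = 201 the true sum 401 no longer fits in 8 bits, yet
 *   (x & y) + ((x ^ y) >> 1) = 200 + 0 = 200 = (x + y) >> 1
 *   (x | y) - ((x ^ y) >> 1) = 201 - 0 = 201 = (x + y + 1) >> 1
 * so both averages are obtained without ever forming the 9-bit sum.
 */
#if 0
static inline uint8_t scalar_avg_trunc (uint8_t x, uint8_t y)
{
    return (x & y) + ((x ^ y) >> 1);    /* (x+y)>>1 without overflow */
}

static inline uint8_t scalar_avg_round (uint8_t x, uint8_t y)
{
    return (x | y) - ((x ^ y) >> 1);    /* (x+y+1)>>1 without overflow */
}
#endif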

/* some rounding constants */
static mmx_t mask1 = {0xfefefefefefefefeLL};
static mmx_t round4 = {0x0002000200020002LL};

/*
 * This code should probably be compiled with loop unrolling
 * (i.e., -funroll-loops in gcc) because some of the loops
 * use a small static number of iterations.  This was written
 * with the assumption that the compiler knows best about when
 * unrolling will help.
 */

static inline void mmx_zero_reg ()
{
    /* load 0 into mm0 */
    pxor_r2r (mm0, mm0);
}

static inline void mmx_average_2_U8 (uint8_t * dest, const uint8_t * src1,
                                     const uint8_t * src2)
{
    /* *dest = (*src1 + *src2 + 1)/ 2; */

    movq_m2r (*src1, mm1);        /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);          /* copy 8 src1 bytes */

    movq_m2r (*src2, mm3);        /* load 8 src2 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src2 bytes */

    pxor_r2r (mm1, mm3);          /* xor src1 and src2 */
    pand_m2r (mask1, mm3);        /* mask lower bits */
    psrlq_i2r (1, mm3);           /* /2 */
    por_r2r (mm2, mm4);           /* or src1 and src2 */
    psubb_r2r (mm3, mm4);         /* subtract subresults */
    movq_r2m (mm4, *dest);        /* store result in dest */
}

static inline void mmx_interp_average_2_U8 (uint8_t * dest,
                                            const uint8_t * src1,
                                            const uint8_t * src2)
{
    /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */

    movq_m2r (*dest, mm1);        /* load 8 dest bytes */
    movq_r2r (mm1, mm2);          /* copy 8 dest bytes */

    movq_m2r (*src1, mm3);        /* load 8 src1 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src1 bytes */

    movq_m2r (*src2, mm5);        /* load 8 src2 bytes */
    movq_r2r (mm5, mm6);          /* copy 8 src2 bytes */

    pxor_r2r (mm3, mm5);          /* xor src1 and src2 */
    pand_m2r (mask1, mm5);        /* mask lower bits */
    psrlq_i2r (1, mm5);           /* /2 */
    por_r2r (mm4, mm6);           /* or src1 and src2 */
    psubb_r2r (mm5, mm6);         /* subtract subresults */
    movq_r2r (mm6, mm5);          /* copy subresult */

    pxor_r2r (mm1, mm5);          /* xor srcavg and dest */
    pand_m2r (mask1, mm5);        /* mask lower bits */
    psrlq_i2r (1, mm5);           /* /2 */
    por_r2r (mm2, mm6);           /* or srcavg and dest */
    psubb_r2r (mm5, mm6);         /* subtract subresults */
    movq_r2m (mm6, *dest);        /* store result in dest */
}

static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
                                     const uint8_t * src2,
                                     const uint8_t * src3,
                                     const uint8_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */

    movq_m2r (*src1, mm1);        /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);          /* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);     /* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);     /* unpack high src1 bytes */

    movq_m2r (*src2, mm3);        /* load 8 src2 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);     /* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);     /* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);         /* add lows */
    paddw_r2r (mm4, mm2);         /* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);        /* load 8 src3 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);     /* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);     /* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);         /* add lows */
    paddw_r2r (mm4, mm2);         /* add highs */

    movq_m2r (*src4, mm5);        /* load 8 src4 bytes */
    movq_r2r (mm5, mm6);          /* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);     /* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);     /* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);         /* add lows */
    paddw_r2r (mm6, mm2);         /* add highs */

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           /* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           /* /4 */

    packuswb_r2r (mm2, mm1);      /* pack (w/ saturation) */
    movq_r2m (mm1, *dest);        /* store result in dest */
}

static inline void mmx_interp_average_4_U8 (uint8_t * dest,
                                            const uint8_t * src1,
                                            const uint8_t * src2,
                                            const uint8_t * src3,
                                            const uint8_t * src4)
{
    /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */

    movq_m2r (*src1, mm1);        /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);          /* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);     /* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);     /* unpack high src1 bytes */

    movq_m2r (*src2, mm3);        /* load 8 src2 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);     /* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);     /* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);         /* add lows */
    paddw_r2r (mm4, mm2);         /* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);        /* load 8 src3 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);     /* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);     /* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);         /* add lows */
    paddw_r2r (mm4, mm2);         /* add highs */

    movq_m2r (*src4, mm5);        /* load 8 src4 bytes */
    movq_r2r (mm5, mm6);          /* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);     /* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);     /* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);         /* add lows */
    paddw_r2r (mm6, mm2);         /* add highs */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           /* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           /* /4 */

    /* now have subtotal/4 in mm1 and mm2 */

    movq_m2r (*dest, mm3);        /* load 8 dest bytes */
    movq_r2r (mm3, mm4);          /* copy 8 dest bytes */

    packuswb_r2r (mm2, mm1);      /* pack (w/ saturation) */
    movq_r2r (mm1, mm2);          /* copy subresult */

    pxor_r2r (mm1, mm3);          /* xor srcavg and dest */
    pand_m2r (mask1, mm3);        /* mask lower bits */
    psrlq_i2r (1, mm3);           /* /2 */
    por_r2r (mm2, mm4);           /* or srcavg and dest */
    psubb_r2r (mm3, mm4);         /* subtract subresults */
    movq_r2m (mm4, *dest);        /* store result in dest */
}

/*-----------------------------------------------------------------------*/

static inline void MC_avg_mmx (const int width, int height, uint8_t * dest,
                               const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, dest, ref);

        if (width == 16)
            mmx_average_2_U8 (dest+8, dest+8, ref+8);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_avg_o_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_avg_mmx (16, height, dest, ref, stride);
}

static void MC_avg_o_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_avg_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_put_mmx (const int width, int height, uint8_t * dest,
                               const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
        movq_m2r (* ref, mm1);        /* load 8 ref bytes */
        movq_r2m (mm1,* dest);        /* store 8 bytes at curr */

        if (width == 16)
        {
            movq_m2r (* (ref+8), mm1);        /* load 8 ref bytes */
            movq_r2m (mm1,* (dest+8));        /* store 8 bytes at curr */
        }

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_put_o_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_put_mmx (16, height, dest, ref, stride);
}

static void MC_put_o_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_put_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

/* Half pixel interpolation in the x direction */
static inline void MC_avg_x_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_avg_x_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_avg_x_mmx (16, height, dest, ref, stride);
}

static void MC_avg_x_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_avg_x_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_put_x_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_put_x_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_put_x_mmx (16, height, dest, ref, stride);
}

static void MC_put_x_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_put_x_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_avg_xy_mmx (const int width, int height, uint8_t * dest,
                                  const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
        mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

        if (width == 16)
            mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
                                     ref_next+8, ref_next+9);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_avg_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_avg_xy_mmx (16, height, dest, ref, stride);
}

static void MC_avg_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_avg_xy_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_put_xy_mmx (const int width, int height, uint8_t * dest,
                                  const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
        mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

        if (width == 16)
            mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_put_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_put_xy_mmx (16, height, dest, ref, stride);
}

static void MC_put_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_put_xy_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_avg_y_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_avg_y_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_avg_y_mmx (16, height, dest, ref, stride);
}

static void MC_avg_y_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_avg_y_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_put_y_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_put_y_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_put_y_mmx (16, height, dest, ref, stride);
}

static void MC_put_y_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_put_y_mmx (8, height, dest, ref, stride);
}


MPEG2_MC_EXTERN (mmx)


/* CPU_MMXEXT/CPU_3DNOW adaptation layer */

#define pavg_r2r(src,dest) \
do { \
    if (cpu == CPU_MMXEXT) \
        pavgb_r2r (src, dest); \
    else \
        pavgusb_r2r (src, dest); \
} while (0)

#define pavg_m2r(src,dest) \
do { \
    if (cpu == CPU_MMXEXT) \
        pavgb_m2r (src, dest); \
    else \
        pavgusb_m2r (src, dest); \
} while (0)
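
/*
 * Note: every helper below receives a literal CPU_MMXEXT or CPU_3DNOW in
 * its cpu argument, so once those helpers are inlined an optimizing
 * compiler can fold this branch away and emit a single pavgb (MMXEXT)
 * or pavgusb (3DNow) instruction per use.
 */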


/* CPU_MMXEXT code */


static inline void MC_put1_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride)
{
    do {
        movq_m2r (*ref, mm0);
        movq_r2m (mm0, *dest);
        ref += stride;
        dest += stride;
    } while (--height);
}

static inline void MC_put1_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        ref += stride;
        movq_r2m (mm0, *dest);
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}

static inline void MC_avg1_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*dest, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}

static inline void MC_avg1_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride, const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        pavg_m2r (*dest, mm0);
        pavg_m2r (*(dest+8), mm1);
        movq_r2m (mm0, *dest);
        ref += stride;
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}

static inline void MC_put2_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int offset,
                              const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*(ref+offset), mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}

static inline void MC_put2_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride, const int offset,
                               const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*(ref+offset+8), mm1);
        movq_r2m (mm0, *dest);
        ref += stride;
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}

static inline void MC_avg2_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int offset,
                              const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*dest, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}

static inline void MC_avg2_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride, const int offset,
                               const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*(ref+offset+8), mm1);
        pavg_m2r (*dest, mm0);
        pavg_m2r (*(dest+8), mm1);
        ref += stride;
        movq_r2m (mm0, *dest);
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}

static mmx_t mask_one = {0x0101010101010101LL};

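/*
 * The MC_put4 and MC_avg4 helpers below average four pixels with two
 * nested pavg operations.  pavg always rounds up, so the nested form can
 * come out one higher than (a + b + c + d + 2) / 4; the xor/or/and steps
 * with mask_one extract a per-byte correction bit that psubusb then
 * subtracts to restore the exactly rounded result.
 */
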
static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int cpu)
{
    movq_m2r (*ref, mm0);
    movq_m2r (*(ref+1), mm1);
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);
    pavg_r2r (mm1, mm0);
    ref += stride;

    do {
        movq_m2r (*ref, mm2);
        movq_r2r (mm0, mm5);

        movq_m2r (*(ref+1), mm3);
        movq_r2r (mm2, mm6);

        pxor_r2r (mm3, mm6);
        pavg_r2r (mm3, mm2);

        por_r2r (mm6, mm7);
        pxor_r2r (mm2, mm5);

        pand_r2r (mm5, mm7);
        pavg_r2r (mm2, mm0);

        pand_m2r (mask_one, mm7);

        psubusb_r2r (mm7, mm0);

        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;

        movq_r2r (mm6, mm7);        /* unroll ! */
        movq_r2r (mm2, mm0);        /* unroll ! */
    } while (--height);
}

static inline void MC_put4_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride, const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_r2m (mm0, *dest);

        movq_m2r (*(ref+8), mm0);
        movq_m2r (*(ref+stride+9), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+9), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride+8), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        ref += stride;
        movq_r2m (mm0, *(dest+8));
        dest += stride;
    } while (--height);
}

static inline void MC_avg4_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*dest, mm1);
        pavg_r2r (mm1, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}

static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride, const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*dest, mm1);
        pavg_r2r (mm1, mm0);
        movq_r2m (mm0, *dest);

        movq_m2r (*(ref+8), mm0);
        movq_m2r (*(ref+stride+9), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+9), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride+8), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*(dest+8), mm1);
        pavg_r2r (mm1, mm0);
        ref += stride;
        movq_r2m (mm0, *(dest+8));
        dest += stride;
    } while (--height);
}

static void MC_avg_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_avg1_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_avg_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_avg1_8 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}

static void MC_put_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}

static void MC_avg_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_avg_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_put_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_put_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_avg_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_avg_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_put_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_put_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_avg_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                 int stride, int height)
{
    MC_avg4_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_avg_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_avg4_8 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                 int stride, int height)
{
    MC_put4_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_put4_8 (height, dest, ref, stride, CPU_MMXEXT);
}


MPEG2_MC_EXTERN (mmxext)


static void MC_avg_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_avg1_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_avg_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_avg1_8 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}

static void MC_put_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}

static void MC_avg_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_avg_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_put_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_put_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_avg_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_avg_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_put_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_put_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_avg_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_avg4_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_avg_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_avg4_8 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_put4_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put4_8 (height, dest, ref, stride, CPU_3DNOW);
}


MPEG2_MC_EXTERN (3dnow)

#endif