Mercurial > mplayer.hg
annotate libmpeg2/motion_comp_mmx.c @ 18971:ec2f6323fda3
Change SRC_PATH for ffmpeg back to '..' to avoid hardcoding current
directory at configure time. This should work again now that libpostproc
is no longer under libavcodec and all Makefiles included from ffmpeg are
at the same directory level.
The hardcoded paths caused breakage if the build directory was moved or
copied after configure and prevented ccache from sharing compilation
results between directories (different absolute include paths count as
different compiler options).
author | uau |
---|---|
date | Sun, 09 Jul 2006 14:06:13 +0000 |
parents | 0783dd397f74 |
children | 60a39d71e247 |
rev | line source |
---|---|
1 | 1 /* |
2 * motion_comp_mmx.c | |
10303 | 3 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org> |
9852 | 4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> |
1 | 5 * |
6 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. | |
9852 | 7 * See http://libmpeg2.sourceforge.net/ for updates. |
1 | 8 * |
9 * mpeg2dec is free software; you can redistribute it and/or modify | |
10 * it under the terms of the GNU General Public License as published by | |
11 * the Free Software Foundation; either version 2 of the License, or | |
12 * (at your option) any later version. | |
13 * | |
14 * mpeg2dec is distributed in the hope that it will be useful, | |
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 * GNU General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU General Public License | |
20 * along with this program; if not, write to the Free Software | |
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
14732
1385ec491ffb
Mark locally modified files as such to comply more closely with GPL 2a.
diego
parents:
13864
diff
changeset
|
22 * |
1385ec491ffb
Mark locally modified files as such to comply more closely with GPL 2a.
diego
parents:
13864
diff
changeset
|
23 * Modified for use with MPlayer, see libmpeg-0.4.0.diff for the exact changes. |
18783 | 24 * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/ |
14732
1385ec491ffb
Mark locally modified files as such to comply more closely with GPL 2a.
diego
parents:
13864
diff
changeset
|
25 * $Id$ |
1 | 26 */ |
27 | |
28 #include "config.h" | |
29 | |
13864 | 30 #if defined(ARCH_X86) || defined(ARCH_X86_64) |
1 | 31 |
32 #include <inttypes.h> | |
33 | |
9852 | 34 #include "mpeg2.h" |
12932 | 35 #include "attributes.h" |
1 | 36 #include "mpeg2_internal.h" |
37 #include "mmx.h" | |
38 | |
/*
 * Selector values handed as the `cpu` argument to the kernels shared by the
 * MMXEXT and 3DNow! entry points; the pavg_* macros compare against
 * CPU_MMXEXT to pick the byte-average instruction.
 */
#define CPU_MMXEXT	0
#define CPU_3DNOW	1
41 | |
42 | |
36 | 43 /* MMX code - needs a rewrite */ |
1 | 44 |
9852 | 45 /* |
46 * Motion Compensation frequently needs to average values using the | |
47 * formula (x+y+1)>>1. Both MMXEXT and 3Dnow include one instruction | |
48 * to compute this, but it's been left out of classic MMX. | |
49 * | |
50 * We need to be careful of overflows when doing this computation. | |
51 * Rather than unpacking data to 16-bits, which reduces parallelism, | |
52 * we use the following formulas: | |
53 * | |
54 * (x+y)>>1 == (x&y)+((x^y)>>1) | |
55 * (x+y+1)>>1 == (x|y)-((x^y)>>1) | |
56 */ | |
1 | 57 |
36 | 58 /* some rounding constants */ |
9852 | 59 static mmx_t mask1 = {0xfefefefefefefefeLL}; |
60 static mmx_t round4 = {0x0002000200020002LL}; | |
1 | 61 |
62 /* | |
63 * This code should probably be compiled with loop unrolling | |
64 * (i.e., -funroll-loops in gcc) because some of the loops |
65 * use a small static number of iterations. This was written | |
66 * with the assumption the compiler knows best about when | |
67 * unrolling will help | |
68 */ | |
69 | |
17566
f580a7755ac5
Patch by Stefan Huehner / stefan % huehner ! org \
rathann
parents:
14732
diff
changeset
|
70 static inline void mmx_zero_reg (void) |
1 | 71 { |
36 | 72 /* load 0 into mm0 */ |
1 | 73 pxor_r2r (mm0, mm0); |
74 } | |
75 | |
9852 | 76 static inline void mmx_average_2_U8 (uint8_t * dest, const uint8_t * src1, |
77 const uint8_t * src2) | |
1 | 78 { |
36 | 79 /* *dest = (*src1 + *src2 + 1)/ 2; */ |
1 | 80 |
9852 | 81 movq_m2r (*src1, mm1); /* load 8 src1 bytes */ |
82 movq_r2r (mm1, mm2); /* copy 8 src1 bytes */ | |
1 | 83 |
9852 | 84 movq_m2r (*src2, mm3); /* load 8 src2 bytes */ |
85 movq_r2r (mm3, mm4); /* copy 8 src2 bytes */ | |
1 | 86 |
9852 | 87 pxor_r2r (mm1, mm3); /* xor src1 and src2 */ |
88 pand_m2r (mask1, mm3); /* mask lower bits */ | |
89 psrlq_i2r (1, mm3); /* /2 */ | |
90 por_r2r (mm2, mm4); /* or src1 and src2 */ | |
91 psubb_r2r (mm3, mm4); /* subtract subresults */ | |
92 movq_r2m (mm4, *dest); /* store result in dest */ | |
1 | 93 } |
94 | |
95 static inline void mmx_interp_average_2_U8 (uint8_t * dest, | |
9852 | 96 const uint8_t * src1, |
97 const uint8_t * src2) | |
1 | 98 { |
36 | 99 /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */ |
1 | 100 |
9852 | 101 movq_m2r (*dest, mm1); /* load 8 dest bytes */ |
102 movq_r2r (mm1, mm2); /* copy 8 dest bytes */ | |
1 | 103 |
9852 | 104 movq_m2r (*src1, mm3); /* load 8 src1 bytes */ |
105 movq_r2r (mm3, mm4); /* copy 8 src1 bytes */ | |
1 | 106 |
9852 | 107 movq_m2r (*src2, mm5); /* load 8 src2 bytes */ |
108 movq_r2r (mm5, mm6); /* copy 8 src2 bytes */ | |
1 | 109 |
9852 | 110 pxor_r2r (mm3, mm5); /* xor src1 and src2 */ |
111 pand_m2r (mask1, mm5); /* mask lower bits */ | |
112 psrlq_i2r (1, mm5); /* /2 */ | |
113 por_r2r (mm4, mm6); /* or src1 and src2 */ | |
114 psubb_r2r (mm5, mm6); /* subtract subresults */ | |
115 movq_r2r (mm6, mm5); /* copy subresult */ | |
1 | 116 |
9852 | 117 pxor_r2r (mm1, mm5); /* xor srcavg and dest */ |
118 pand_m2r (mask1, mm5); /* mask lower bits */ | |
119 psrlq_i2r (1, mm5); /* /2 */ | |
120 por_r2r (mm2, mm6); /* or srcavg and dest */ | |
121 psubb_r2r (mm5, mm6); /* subtract subresults */ | |
122 movq_r2m (mm6, *dest); /* store result in dest */ | |
1 | 123 } |
124 | |
9852 | 125 static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1, |
126 const uint8_t * src2, | |
127 const uint8_t * src3, | |
128 const uint8_t * src4) | |
1 | 129 { |
36 | 130 /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */ |
1 | 131 |
9852 | 132 movq_m2r (*src1, mm1); /* load 8 src1 bytes */ |
133 movq_r2r (mm1, mm2); /* copy 8 src1 bytes */ | |
1 | 134 |
9852 | 135 punpcklbw_r2r (mm0, mm1); /* unpack low src1 bytes */ |
136 punpckhbw_r2r (mm0, mm2); /* unpack high src1 bytes */ | |
1 | 137 |
9852 | 138 movq_m2r (*src2, mm3); /* load 8 src2 bytes */ |
139 movq_r2r (mm3, mm4); /* copy 8 src2 bytes */ | |
1 | 140 |
9852 | 141 punpcklbw_r2r (mm0, mm3); /* unpack low src2 bytes */ |
142 punpckhbw_r2r (mm0, mm4); /* unpack high src2 bytes */ | |
1 | 143 |
9852 | 144 paddw_r2r (mm3, mm1); /* add lows */ |
145 paddw_r2r (mm4, mm2); /* add highs */ | |
1 | 146 |
36 | 147 /* now have partials in mm1 and mm2 */ |
1 | 148 |
9852 | 149 movq_m2r (*src3, mm3); /* load 8 src3 bytes */ |
150 movq_r2r (mm3, mm4); /* copy 8 src3 bytes */ | |
1 | 151 |
9852 | 152 punpcklbw_r2r (mm0, mm3); /* unpack low src3 bytes */ |
153 punpckhbw_r2r (mm0, mm4); /* unpack high src3 bytes */ | |
1 | 154 |
9852 | 155 paddw_r2r (mm3, mm1); /* add lows */ |
156 paddw_r2r (mm4, mm2); /* add highs */ | |
1 | 157 |
9852 | 158 movq_m2r (*src4, mm5); /* load 8 src4 bytes */ |
159 movq_r2r (mm5, mm6); /* copy 8 src4 bytes */ | |
1 | 160 |
9852 | 161 punpcklbw_r2r (mm0, mm5); /* unpack low src4 bytes */ |
162 punpckhbw_r2r (mm0, mm6); /* unpack high src4 bytes */ | |
1 | 163 |
9852 | 164 paddw_r2r (mm5, mm1); /* add lows */ |
165 paddw_r2r (mm6, mm2); /* add highs */ | |
1 | 166 |
36 | 167 /* now have subtotal in mm1 and mm2 */ |
1 | 168 |
169 paddw_m2r (round4, mm1); | |
9852 | 170 psraw_i2r (2, mm1); /* /4 */ |
1 | 171 paddw_m2r (round4, mm2); |
9852 | 172 psraw_i2r (2, mm2); /* /4 */ |
1 | 173 |
9852 | 174 packuswb_r2r (mm2, mm1); /* pack (w/ saturation) */ |
175 movq_r2m (mm1, *dest); /* store result in dest */ | |
1 | 176 } |
177 | |
178 static inline void mmx_interp_average_4_U8 (uint8_t * dest, | |
9852 | 179 const uint8_t * src1, |
180 const uint8_t * src2, | |
181 const uint8_t * src3, | |
182 const uint8_t * src4) | |
1 | 183 { |
36 | 184 /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */ |
1 | 185 |
9852 | 186 movq_m2r (*src1, mm1); /* load 8 src1 bytes */ |
187 movq_r2r (mm1, mm2); /* copy 8 src1 bytes */ | |
1 | 188 |
9852 | 189 punpcklbw_r2r (mm0, mm1); /* unpack low src1 bytes */ |
190 punpckhbw_r2r (mm0, mm2); /* unpack high src1 bytes */ | |
1 | 191 |
9852 | 192 movq_m2r (*src2, mm3); /* load 8 src2 bytes */ |
193 movq_r2r (mm3, mm4); /* copy 8 src2 bytes */ | |
1 | 194 |
9852 | 195 punpcklbw_r2r (mm0, mm3); /* unpack low src2 bytes */ |
196 punpckhbw_r2r (mm0, mm4); /* unpack high src2 bytes */ | |
1 | 197 |
9852 | 198 paddw_r2r (mm3, mm1); /* add lows */ |
199 paddw_r2r (mm4, mm2); /* add highs */ | |
1 | 200 |
36 | 201 /* now have partials in mm1 and mm2 */ |
1 | 202 |
9852 | 203 movq_m2r (*src3, mm3); /* load 8 src3 bytes */ |
204 movq_r2r (mm3, mm4); /* copy 8 src3 bytes */ | |
1 | 205 |
9852 | 206 punpcklbw_r2r (mm0, mm3); /* unpack low src3 bytes */ |
207 punpckhbw_r2r (mm0, mm4); /* unpack high src3 bytes */ | |
1 | 208 |
9852 | 209 paddw_r2r (mm3, mm1); /* add lows */ |
210 paddw_r2r (mm4, mm2); /* add highs */ | |
1 | 211 |
9852 | 212 movq_m2r (*src4, mm5); /* load 8 src4 bytes */ |
213 movq_r2r (mm5, mm6); /* copy 8 src4 bytes */ | |
1 | 214 |
9852 | 215 punpcklbw_r2r (mm0, mm5); /* unpack low src4 bytes */ |
216 punpckhbw_r2r (mm0, mm6); /* unpack high src4 bytes */ | |
1 | 217 |
9852 | 218 paddw_r2r (mm5, mm1); /* add lows */ |
219 paddw_r2r (mm6, mm2); /* add highs */ | |
1 | 220 |
221 paddw_m2r (round4, mm1); | |
9852 | 222 psraw_i2r (2, mm1); /* /4 */ |
1 | 223 paddw_m2r (round4, mm2); |
9852 | 224 psraw_i2r (2, mm2); /* /4 */ |
1 | 225 |
36 | 226 /* now have subtotal/4 in mm1 and mm2 */ |
1 | 227 |
9852 | 228 movq_m2r (*dest, mm3); /* load 8 dest bytes */ |
229 movq_r2r (mm3, mm4); /* copy 8 dest bytes */ | |
1 | 230 |
9852 | 231 packuswb_r2r (mm2, mm1); /* pack (w/ saturation) */ |
232 movq_r2r (mm1,mm2); /* copy subresult */ | |
1 | 233 |
9852 | 234 pxor_r2r (mm1, mm3); /* xor srcavg and dest */ |
235 pand_m2r (mask1, mm3); /* mask lower bits */ | |
236 psrlq_i2r (1, mm3); /* /2 */ | |
237 por_r2r (mm2, mm4); /* or srcavg and dest */ | |
238 psubb_r2r (mm3, mm4); /* subtract subresults */ | |
239 movq_r2m (mm4, *dest); /* store result in dest */ | |
1 | 240 } |
241 | |
36 | 242 /*-----------------------------------------------------------------------*/ |
1 | 243 |
/*
 * dest = avg (dest, ref) over `height` rows of 8 bytes, or 16 bytes when
 * width == 16.  Both pointers advance by `stride` per row.
 * height must be non-zero: the row body runs before the counter is tested.
 */
static inline void MC_avg_mmx (const int width, int height, uint8_t * dest,
			       const uint8_t * ref, const int stride)
{
    /* mmx_average_2_U8 itself only touches mm1..mm4 */
    mmx_zero_reg ();

    do {
	mmx_average_2_U8 (dest, dest, ref);

	if (width == 16) {
	    /* second 8-byte half of the row */
	    mmx_average_2_U8 (dest + 8, dest + 8, ref + 8);
	}

	ref += stride;
	dest += stride;
    } while (--height);
}
259 | |
/* Plain MMX: dest = avg (dest, ref), 16 bytes per row. */
static void MC_avg_o_16_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_avg_mmx (16, height, dest, ref, stride);
}
265 | |
/* Plain MMX: dest = avg (dest, ref), 8 bytes per row. */
static void MC_avg_o_8_mmx (uint8_t * dest, const uint8_t * ref,
			    int stride, int height)
{
    MC_avg_mmx (8, height, dest, ref, stride);
}
271 | |
36 | 272 /*-----------------------------------------------------------------------*/ |
1 | 273 |
9852 | 274 static inline void MC_put_mmx (const int width, int height, uint8_t * dest, |
275 const uint8_t * ref, const int stride) | |
1 | 276 { |
277 mmx_zero_reg (); | |
278 | |
279 do { | |
9852 | 280 movq_m2r (* ref, mm1); /* load 8 ref bytes */ |
281 movq_r2m (mm1,* dest); /* store 8 bytes at curr */ | |
1 | 282 |
283 if (width == 16) | |
284 { | |
9852 | 285 movq_m2r (* (ref+8), mm1); /* load 8 ref bytes */ |
286 movq_r2m (mm1,* (dest+8)); /* store 8 bytes at curr */ | |
1 | 287 } |
288 | |
289 dest += stride; | |
290 ref += stride; | |
291 } while (--height); | |
292 } | |
293 | |
/* Plain MMX: copy ref to dest, 16 bytes per row. */
static void MC_put_o_16_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_put_mmx (16, height, dest, ref, stride);
}
299 | |
/* Plain MMX: copy ref to dest, 8 bytes per row. */
static void MC_put_o_8_mmx (uint8_t * dest, const uint8_t * ref,
			    int stride, int height)
{
    MC_put_mmx (8, height, dest, ref, stride);
}
305 | |
36 | 306 /*-----------------------------------------------------------------------*/ |
1 | 307 |
36 | 308 /* Half pixel interpolation in the x direction */ |
/*
 * Horizontal half-pel, blended into dest:
 *     dest = avg (dest, avg (ref, ref + 1))
 * for `height` rows of 8 bytes (16 when width == 16).
 * height must be non-zero: the row body runs before the counter is tested.
 */
static inline void MC_avg_x_mmx (const int width, int height, uint8_t * dest,
				 const uint8_t * ref, const int stride)
{
    /* mmx_interp_average_2_U8 itself only touches mm1..mm6 */
    mmx_zero_reg ();

    do {
	mmx_interp_average_2_U8 (dest, ref, ref + 1);

	if (width == 16) {
	    /* second 8-byte half of the row */
	    mmx_interp_average_2_U8 (dest + 8, ref + 8, ref + 9);
	}

	ref += stride;
	dest += stride;
    } while (--height);
}
324 | |
/* Plain MMX: dest = avg (dest, avg (ref, ref + 1)), 16 bytes per row. */
static void MC_avg_x_16_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_avg_x_mmx (16, height, dest, ref, stride);
}
330 | |
/* Plain MMX: dest = avg (dest, avg (ref, ref + 1)), 8 bytes per row. */
static void MC_avg_x_8_mmx (uint8_t * dest, const uint8_t * ref,
			    int stride, int height)
{
    MC_avg_x_mmx (8, height, dest, ref, stride);
}
336 | |
36 | 337 /*-----------------------------------------------------------------------*/ |
1 | 338 |
/*
 * Horizontal half-pel, written straight to dest:
 *     dest = avg (ref, ref + 1)
 * for `height` rows of 8 bytes (16 when width == 16).
 * height must be non-zero: the row body runs before the counter is tested.
 */
static inline void MC_put_x_mmx (const int width, int height, uint8_t * dest,
				 const uint8_t * ref, const int stride)
{
    /* mmx_average_2_U8 itself only touches mm1..mm4 */
    mmx_zero_reg ();

    do {
	mmx_average_2_U8 (dest, ref, ref + 1);

	if (width == 16) {
	    /* second 8-byte half of the row */
	    mmx_average_2_U8 (dest + 8, ref + 8, ref + 9);
	}

	ref += stride;
	dest += stride;
    } while (--height);
}
354 | |
/* Plain MMX: dest = avg (ref, ref + 1), 16 bytes per row. */
static void MC_put_x_16_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_put_x_mmx (16, height, dest, ref, stride);
}
360 | |
/* Plain MMX: dest = avg (ref, ref + 1), 8 bytes per row. */
static void MC_put_x_8_mmx (uint8_t * dest, const uint8_t * ref,
			    int stride, int height)
{
    MC_put_x_mmx (8, height, dest, ref, stride);
}
366 | |
36 | 367 /*-----------------------------------------------------------------------*/ |
1 | 368 |
/*
 * Diagonal half-pel, blended into dest: the 4-way average of ref, ref + 1
 * and the same two positions one stride further on is averaged with the
 * bytes already at dest.  Handles `height` rows of 8 bytes (16 when
 * width == 16).
 * height must be non-zero: the row body runs before the counter is tested.
 */
static inline void MC_avg_xy_mmx (const int width, int height, uint8_t * dest,
				  const uint8_t * ref, const int stride)
{
    const uint8_t * below = ref + stride;	/* row after the current one */

    /* mm0 must be zero for the byte -> word unpacking in the helper */
    mmx_zero_reg ();

    do {
	mmx_interp_average_4_U8 (dest, ref, ref + 1, below, below + 1);

	if (width == 16) {
	    /* second 8-byte half of the row */
	    mmx_interp_average_4_U8 (dest + 8, ref + 8, ref + 9,
				     below + 8, below + 9);
	}

	below += stride;
	ref += stride;
	dest += stride;
    } while (--height);
}
388 | |
/* Plain MMX: blend the 4-way (x and y) average into dest, 16 bytes per row. */
static void MC_avg_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
			      int stride, int height)
{
    MC_avg_xy_mmx (16, height, dest, ref, stride);
}
394 | |
/* Plain MMX: blend the 4-way (x and y) average into dest, 8 bytes per row. */
static void MC_avg_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_avg_xy_mmx (8, height, dest, ref, stride);
}
400 | |
36 | 401 /*-----------------------------------------------------------------------*/ |
1 | 402 |
/*
 * Diagonal half-pel, written straight to dest: each output byte is the
 * rounded 4-way average of ref, ref + 1 and the same two positions one
 * stride further on.  Handles `height` rows of 8 bytes (16 when
 * width == 16).
 * height must be non-zero: the row body runs before the counter is tested.
 */
static inline void MC_put_xy_mmx (const int width, int height, uint8_t * dest,
				  const uint8_t * ref, const int stride)
{
    const uint8_t * below = ref + stride;	/* row after the current one */

    /* mm0 must be zero for the byte -> word unpacking in the helper */
    mmx_zero_reg ();

    do {
	mmx_average_4_U8 (dest, ref, ref + 1, below, below + 1);

	if (width == 16) {
	    /* second 8-byte half of the row */
	    mmx_average_4_U8 (dest + 8, ref + 8, ref + 9,
			      below + 8, below + 9);
	}

	below += stride;
	ref += stride;
	dest += stride;
    } while (--height);
}
421 | |
/* Plain MMX: dest = 4-way (x and y) average of ref, 16 bytes per row. */
static void MC_put_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
			      int stride, int height)
{
    MC_put_xy_mmx (16, height, dest, ref, stride);
}
427 | |
/* Plain MMX: dest = 4-way (x and y) average of ref, 8 bytes per row. */
static void MC_put_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_put_xy_mmx (8, height, dest, ref, stride);
}
433 | |
36 | 434 /*-----------------------------------------------------------------------*/ |
1 | 435 |
/*
 * Vertical half-pel, blended into dest:
 *     dest = avg (dest, avg (ref, ref + stride))
 * for `height` rows of 8 bytes (16 when width == 16).
 * height must be non-zero: the row body runs before the counter is tested.
 */
static inline void MC_avg_y_mmx (const int width, int height, uint8_t * dest,
				 const uint8_t * ref, const int stride)
{
    const uint8_t * below = ref + stride;	/* row after the current one */

    /* mmx_interp_average_2_U8 itself only touches mm1..mm6 */
    mmx_zero_reg ();

    do {
	mmx_interp_average_2_U8 (dest, ref, below);

	if (width == 16) {
	    /* second 8-byte half of the row */
	    mmx_interp_average_2_U8 (dest + 8, ref + 8, below + 8);
	}

	below += stride;
	ref += stride;
	dest += stride;
    } while (--height);
}
454 | |
/* Plain MMX: dest = avg (dest, avg (ref, ref + stride)), 16 bytes per row. */
static void MC_avg_y_16_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_avg_y_mmx (16, height, dest, ref, stride);
}
460 | |
/* Plain MMX: dest = avg (dest, avg (ref, ref + stride)), 8 bytes per row. */
static void MC_avg_y_8_mmx (uint8_t * dest, const uint8_t * ref,
			    int stride, int height)
{
    MC_avg_y_mmx (8, height, dest, ref, stride);
}
466 | |
36 | 467 /*-----------------------------------------------------------------------*/ |
1 | 468 |
/*
 * Vertical half-pel, written straight to dest:
 *     dest = avg (ref, ref + stride)
 * for `height` rows of 8 bytes (16 when width == 16).
 * height must be non-zero: the row body runs before the counter is tested.
 */
static inline void MC_put_y_mmx (const int width, int height, uint8_t * dest,
				 const uint8_t * ref, const int stride)
{
    const uint8_t * below = ref + stride;	/* row after the current one */

    /* mmx_average_2_U8 itself only touches mm1..mm4 */
    mmx_zero_reg ();

    do {
	mmx_average_2_U8 (dest, ref, below);

	if (width == 16) {
	    /* second 8-byte half of the row */
	    mmx_average_2_U8 (dest + 8, ref + 8, below + 8);
	}

	below += stride;
	ref += stride;
	dest += stride;
    } while (--height);
}
487 | |
/* Plain MMX: dest = avg (ref, ref + stride), 16 bytes per row. */
static void MC_put_y_16_mmx (uint8_t * dest, const uint8_t * ref,
			     int stride, int height)
{
    MC_put_y_mmx (16, height, dest, ref, stride);
}
493 | |
/* Plain MMX: dest = avg (ref, ref + stride), 8 bytes per row. */
static void MC_put_y_8_mmx (uint8_t * dest, const uint8_t * ref,
			    int stride, int height)
{
    MC_put_y_mmx (8, height, dest, ref, stride);
}
499 | |
500 | |
9852 | 501 MPEG2_MC_EXTERN (mmx) |
1 | 502 |
503 | |
504 | |
505 | |
506 | |
507 | |
508 | |
36 | 509 /* CPU_MMXEXT/CPU_3DNOW adaptation layer */ |
1 | 510 |
/*
 * Register-to-register byte average.  Relies on an identifier `cpu` being in
 * scope where the macro is expanded: pavgb is emitted for CPU_MMXEXT and
 * pavgusb for any other value.
 */
#define pavg_r2r(src,dest)		\
do {					\
    if (cpu != CPU_MMXEXT)		\
	pavgusb_r2r (src, dest);	\
    else				\
	pavgb_r2r (src, dest);		\
} while (0)
518 | |
/*
 * Memory-to-register byte average.  Relies on an identifier `cpu` being in
 * scope where the macro is expanded: pavgb is emitted for CPU_MMXEXT and
 * pavgusb for any other value.
 */
#define pavg_m2r(src,dest)		\
do {					\
    if (cpu != CPU_MMXEXT)		\
	pavgusb_m2r (src, dest);	\
    else				\
	pavgb_m2r (src, dest);		\
} while (0)
526 | |
527 | |
36 | 528 /* CPU_MMXEXT code */ |
1 | 529 |
530 | |
9852 | 531 static inline void MC_put1_8 (int height, uint8_t * dest, const uint8_t * ref, |
532 const int stride) | |
1 | 533 { |
534 do { | |
535 movq_m2r (*ref, mm0); | |
536 movq_r2m (mm0, *dest); | |
537 ref += stride; | |
538 dest += stride; | |
539 } while (--height); | |
540 } | |
541 | |
9852 | 542 static inline void MC_put1_16 (int height, uint8_t * dest, const uint8_t * ref, |
543 const int stride) | |
1 | 544 { |
545 do { | |
546 movq_m2r (*ref, mm0); | |
547 movq_m2r (*(ref+8), mm1); | |
548 ref += stride; | |
549 movq_r2m (mm0, *dest); | |
550 movq_r2m (mm1, *(dest+8)); | |
551 dest += stride; | |
552 } while (--height); | |
553 } | |
554 | |
9852 | 555 static inline void MC_avg1_8 (int height, uint8_t * dest, const uint8_t * ref, |
556 const int stride, const int cpu) | |
1 | 557 { |
558 do { | |
559 movq_m2r (*ref, mm0); | |
560 pavg_m2r (*dest, mm0); | |
561 ref += stride; | |
562 movq_r2m (mm0, *dest); | |
563 dest += stride; | |
564 } while (--height); | |
565 } | |
566 | |
9852 | 567 static inline void MC_avg1_16 (int height, uint8_t * dest, const uint8_t * ref, |
568 const int stride, const int cpu) | |
1 | 569 { |
570 do { | |
571 movq_m2r (*ref, mm0); | |
572 movq_m2r (*(ref+8), mm1); | |
573 pavg_m2r (*dest, mm0); | |
574 pavg_m2r (*(dest+8), mm1); | |
575 movq_r2m (mm0, *dest); | |
576 ref += stride; | |
577 movq_r2m (mm1, *(dest+8)); | |
578 dest += stride; | |
579 } while (--height); | |
580 } | |
581 | |
9852 | 582 static inline void MC_put2_8 (int height, uint8_t * dest, const uint8_t * ref, |
583 const int stride, const int offset, | |
584 const int cpu) | |
1 | 585 { |
586 do { | |
587 movq_m2r (*ref, mm0); | |
588 pavg_m2r (*(ref+offset), mm0); | |
589 ref += stride; | |
590 movq_r2m (mm0, *dest); | |
591 dest += stride; | |
592 } while (--height); | |
593 } | |
594 | |
9852 | 595 static inline void MC_put2_16 (int height, uint8_t * dest, const uint8_t * ref, |
596 const int stride, const int offset, | |
597 const int cpu) | |
1 | 598 { |
599 do { | |
600 movq_m2r (*ref, mm0); | |
601 movq_m2r (*(ref+8), mm1); | |
602 pavg_m2r (*(ref+offset), mm0); | |
603 pavg_m2r (*(ref+offset+8), mm1); | |
604 movq_r2m (mm0, *dest); | |
605 ref += stride; | |
606 movq_r2m (mm1, *(dest+8)); | |
607 dest += stride; | |
608 } while (--height); | |
609 } | |
610 | |
9852 | 611 static inline void MC_avg2_8 (int height, uint8_t * dest, const uint8_t * ref, |
612 const int stride, const int offset, | |
613 const int cpu) | |
1 | 614 { |
615 do { | |
616 movq_m2r (*ref, mm0); | |
617 pavg_m2r (*(ref+offset), mm0); | |
618 pavg_m2r (*dest, mm0); | |
619 ref += stride; | |
620 movq_r2m (mm0, *dest); | |
621 dest += stride; | |
622 } while (--height); | |
623 } | |
624 | |
9852 | 625 static inline void MC_avg2_16 (int height, uint8_t * dest, const uint8_t * ref, |
626 const int stride, const int offset, | |
627 const int cpu) | |
1 | 628 { |
629 do { | |
630 movq_m2r (*ref, mm0); | |
631 movq_m2r (*(ref+8), mm1); | |
632 pavg_m2r (*(ref+offset), mm0); | |
633 pavg_m2r (*(ref+offset+8), mm1); | |
634 pavg_m2r (*dest, mm0); | |
635 pavg_m2r (*(dest+8), mm1); | |
636 ref += stride; | |
637 movq_r2m (mm0, *dest); | |
638 movq_r2m (mm1, *(dest+8)); | |
639 dest += stride; | |
640 } while (--height); | |
641 } | |
642 | |
643 static mmx_t mask_one = {0x0101010101010101LL}; | |
644 | |
9852 | 645 static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref, |
646 const int stride, const int cpu) | |
1 | 647 { |
648 movq_m2r (*ref, mm0); | |
649 movq_m2r (*(ref+1), mm1); | |
650 movq_r2r (mm0, mm7); | |
651 pxor_r2r (mm1, mm7); | |
652 pavg_r2r (mm1, mm0); | |
653 ref += stride; | |
654 | |
655 do { | |
656 movq_m2r (*ref, mm2); | |
657 movq_r2r (mm0, mm5); | |
658 | |
659 movq_m2r (*(ref+1), mm3); | |
660 movq_r2r (mm2, mm6); | |
661 | |
662 pxor_r2r (mm3, mm6); | |
663 pavg_r2r (mm3, mm2); | |
664 | |
665 por_r2r (mm6, mm7); | |
666 pxor_r2r (mm2, mm5); | |
667 | |
668 pand_r2r (mm5, mm7); | |
669 pavg_r2r (mm2, mm0); | |
670 | |
671 pand_m2r (mask_one, mm7); | |
672 | |
673 psubusb_r2r (mm7, mm0); | |
674 | |
675 ref += stride; | |
676 movq_r2m (mm0, *dest); | |
677 dest += stride; | |
678 | |
9852 | 679 movq_r2r (mm6, mm7); /* unroll ! */ |
680 movq_r2r (mm2, mm0); /* unroll ! */ | |
1 | 681 } while (--height); |
682 } | |
683 | |
9852 | 684 static inline void MC_put4_16 (int height, uint8_t * dest, const uint8_t * ref, |
685 const int stride, const int cpu) | |
1 | 686 { |
687 do { | |
688 movq_m2r (*ref, mm0); | |
689 movq_m2r (*(ref+stride+1), mm1); | |
690 movq_r2r (mm0, mm7); | |
691 movq_m2r (*(ref+1), mm2); | |
692 pxor_r2r (mm1, mm7); | |
693 movq_m2r (*(ref+stride), mm3); | |
694 movq_r2r (mm2, mm6); | |
695 pxor_r2r (mm3, mm6); | |
696 pavg_r2r (mm1, mm0); | |
697 pavg_r2r (mm3, mm2); | |
698 por_r2r (mm6, mm7); | |
699 movq_r2r (mm0, mm6); | |
700 pxor_r2r (mm2, mm6); | |
701 pand_r2r (mm6, mm7); | |
702 pand_m2r (mask_one, mm7); | |
703 pavg_r2r (mm2, mm0); | |
704 psubusb_r2r (mm7, mm0); | |
705 movq_r2m (mm0, *dest); | |
706 | |
707 movq_m2r (*(ref+8), mm0); | |
708 movq_m2r (*(ref+stride+9), mm1); | |
709 movq_r2r (mm0, mm7); | |
710 movq_m2r (*(ref+9), mm2); | |
711 pxor_r2r (mm1, mm7); | |
712 movq_m2r (*(ref+stride+8), mm3); | |
713 movq_r2r (mm2, mm6); | |
714 pxor_r2r (mm3, mm6); | |
715 pavg_r2r (mm1, mm0); | |
716 pavg_r2r (mm3, mm2); | |
717 por_r2r (mm6, mm7); | |
718 movq_r2r (mm0, mm6); | |
719 pxor_r2r (mm2, mm6); | |
720 pand_r2r (mm6, mm7); | |
721 pand_m2r (mask_one, mm7); | |
722 pavg_r2r (mm2, mm0); | |
723 psubusb_r2r (mm7, mm0); | |
724 ref += stride; | |
725 movq_r2m (mm0, *(dest+8)); | |
726 dest += stride; | |
727 } while (--height); | |
728 } | |
729 | |
9852 | 730 static inline void MC_avg4_8 (int height, uint8_t * dest, const uint8_t * ref, |
731 const int stride, const int cpu) | |
1 | 732 { |
733 do { | |
734 movq_m2r (*ref, mm0); | |
735 movq_m2r (*(ref+stride+1), mm1); | |
736 movq_r2r (mm0, mm7); | |
737 movq_m2r (*(ref+1), mm2); | |
738 pxor_r2r (mm1, mm7); | |
739 movq_m2r (*(ref+stride), mm3); | |
740 movq_r2r (mm2, mm6); | |
741 pxor_r2r (mm3, mm6); | |
742 pavg_r2r (mm1, mm0); | |
743 pavg_r2r (mm3, mm2); | |
744 por_r2r (mm6, mm7); | |
745 movq_r2r (mm0, mm6); | |
746 pxor_r2r (mm2, mm6); | |
747 pand_r2r (mm6, mm7); | |
748 pand_m2r (mask_one, mm7); | |
749 pavg_r2r (mm2, mm0); | |
750 psubusb_r2r (mm7, mm0); | |
751 movq_m2r (*dest, mm1); | |
752 pavg_r2r (mm1, mm0); | |
753 ref += stride; | |
754 movq_r2m (mm0, *dest); | |
755 dest += stride; | |
756 } while (--height); | |
757 } | |
758 | |
9852 | 759 static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref, |
760 const int stride, const int cpu) | |
1 | 761 { |
762 do { | |
763 movq_m2r (*ref, mm0); | |
764 movq_m2r (*(ref+stride+1), mm1); | |
765 movq_r2r (mm0, mm7); | |
766 movq_m2r (*(ref+1), mm2); | |
767 pxor_r2r (mm1, mm7); | |
768 movq_m2r (*(ref+stride), mm3); | |
769 movq_r2r (mm2, mm6); | |
770 pxor_r2r (mm3, mm6); | |
771 pavg_r2r (mm1, mm0); | |
772 pavg_r2r (mm3, mm2); | |
773 por_r2r (mm6, mm7); | |
774 movq_r2r (mm0, mm6); | |
775 pxor_r2r (mm2, mm6); | |
776 pand_r2r (mm6, mm7); | |
777 pand_m2r (mask_one, mm7); | |
778 pavg_r2r (mm2, mm0); | |
779 psubusb_r2r (mm7, mm0); | |
780 movq_m2r (*dest, mm1); | |
781 pavg_r2r (mm1, mm0); | |
782 movq_r2m (mm0, *dest); | |
783 | |
784 movq_m2r (*(ref+8), mm0); | |
785 movq_m2r (*(ref+stride+9), mm1); | |
786 movq_r2r (mm0, mm7); | |
787 movq_m2r (*(ref+9), mm2); | |
788 pxor_r2r (mm1, mm7); | |
789 movq_m2r (*(ref+stride+8), mm3); | |
790 movq_r2r (mm2, mm6); | |
791 pxor_r2r (mm3, mm6); | |
792 pavg_r2r (mm1, mm0); | |
793 pavg_r2r (mm3, mm2); | |
794 por_r2r (mm6, mm7); | |
795 movq_r2r (mm0, mm6); | |
796 pxor_r2r (mm2, mm6); | |
797 pand_r2r (mm6, mm7); | |
798 pand_m2r (mask_one, mm7); | |
799 pavg_r2r (mm2, mm0); | |
800 psubusb_r2r (mm7, mm0); | |
801 movq_m2r (*(dest+8), mm1); | |
802 pavg_r2r (mm1, mm0); | |
803 ref += stride; | |
804 movq_r2m (mm0, *(dest+8)); | |
805 dest += stride; | |
806 } while (--height); | |
807 } | |
808 | |
9852 | 809 static void MC_avg_o_16_mmxext (uint8_t * dest, const uint8_t * ref, |
810 int stride, int height) | |
1 | 811 { |
812 MC_avg1_16 (height, dest, ref, stride, CPU_MMXEXT); | |
813 } | |
814 | |
9852 | 815 static void MC_avg_o_8_mmxext (uint8_t * dest, const uint8_t * ref, |
816 int stride, int height) | |
1 | 817 { |
818 MC_avg1_8 (height, dest, ref, stride, CPU_MMXEXT); | |
819 } | |
820 | |
/* MMXEXT table entry: plain copy, 16 bytes per row (no averaging, so no
 * cpu selector is passed). */
static void MC_put_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
				int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}
826 | |
/* MMXEXT table entry: plain copy, 8 bytes per row (no averaging, so no
 * cpu selector is passed). */
static void MC_put_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}
832 | |
9852 | 833 static void MC_avg_x_16_mmxext (uint8_t * dest, const uint8_t * ref, |
834 int stride, int height) | |
1 | 835 { |
836 MC_avg2_16 (height, dest, ref, stride, 1, CPU_MMXEXT); | |
837 } | |
838 | |
9852 | 839 static void MC_avg_x_8_mmxext (uint8_t * dest, const uint8_t * ref, |
840 int stride, int height) | |
1 | 841 { |
842 MC_avg2_8 (height, dest, ref, stride, 1, CPU_MMXEXT); | |
843 } | |
844 | |
9852 | 845 static void MC_put_x_16_mmxext (uint8_t * dest, const uint8_t * ref, |
846 int stride, int height) | |
1 | 847 { |
848 MC_put2_16 (height, dest, ref, stride, 1, CPU_MMXEXT); | |
849 } | |
850 | |
9852 | 851 static void MC_put_x_8_mmxext (uint8_t * dest, const uint8_t * ref, |
852 int stride, int height) | |
1 | 853 { |
854 MC_put2_8 (height, dest, ref, stride, 1, CPU_MMXEXT); | |
855 } | |
856 | |
9852 | 857 static void MC_avg_y_16_mmxext (uint8_t * dest, const uint8_t * ref, |
858 int stride, int height) | |
1 | 859 { |
860 MC_avg2_16 (height, dest, ref, stride, stride, CPU_MMXEXT); | |
861 } | |
862 | |
9852 | 863 static void MC_avg_y_8_mmxext (uint8_t * dest, const uint8_t * ref, |
864 int stride, int height) | |
1 | 865 { |
866 MC_avg2_8 (height, dest, ref, stride, stride, CPU_MMXEXT); | |
867 } | |
868 | |
9852 | 869 static void MC_put_y_16_mmxext (uint8_t * dest, const uint8_t * ref, |
870 int stride, int height) | |
1 | 871 { |
872 MC_put2_16 (height, dest, ref, stride, stride, CPU_MMXEXT); | |
873 } | |
874 | |
9852 | 875 static void MC_put_y_8_mmxext (uint8_t * dest, const uint8_t * ref, |
876 int stride, int height) | |
1 | 877 { |
878 MC_put2_8 (height, dest, ref, stride, stride, CPU_MMXEXT); | |
879 } | |
880 | |
9852 | 881 static void MC_avg_xy_16_mmxext (uint8_t * dest, const uint8_t * ref, |
882 int stride, int height) | |
1 | 883 { |
884 MC_avg4_16 (height, dest, ref, stride, CPU_MMXEXT); | |
885 } | |
886 | |
9852 | 887 static void MC_avg_xy_8_mmxext (uint8_t * dest, const uint8_t * ref, |
888 int stride, int height) | |
1 | 889 { |
890 MC_avg4_8 (height, dest, ref, stride, CPU_MMXEXT); | |
891 } | |
892 | |
9852 | 893 static void MC_put_xy_16_mmxext (uint8_t * dest, const uint8_t * ref, |
894 int stride, int height) | |
1 | 895 { |
896 MC_put4_16 (height, dest, ref, stride, CPU_MMXEXT); | |
897 } | |
898 | |
9852 | 899 static void MC_put_xy_8_mmxext (uint8_t * dest, const uint8_t * ref, |
900 int stride, int height) | |
1 | 901 { |
902 MC_put4_8 (height, dest, ref, stride, CPU_MMXEXT); | |
903 } | |
904 | |
905 | |
9852 | 906 MPEG2_MC_EXTERN (mmxext) |
1 | 907 |
908 | |
909 | |
9852 | 910 static void MC_avg_o_16_3dnow (uint8_t * dest, const uint8_t * ref, |
911 int stride, int height) | |
1 | 912 { |
913 MC_avg1_16 (height, dest, ref, stride, CPU_3DNOW); | |
914 } | |
915 | |
9852 | 916 static void MC_avg_o_8_3dnow (uint8_t * dest, const uint8_t * ref, |
917 int stride, int height) | |
1 | 918 { |
919 MC_avg1_8 (height, dest, ref, stride, CPU_3DNOW); | |
920 } | |
921 | |
/* 3DNow! table entry: plain copy, 16 bytes per row (no averaging, so no
 * cpu selector is passed). */
static void MC_put_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
			       int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}
927 | |
/* 3DNow! table entry: plain copy, 8 bytes per row (no averaging, so no
 * cpu selector is passed). */
static void MC_put_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
			      int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}
933 | |
9852 | 934 static void MC_avg_x_16_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 935 int stride, int height) |
936 { | |
937 MC_avg2_16 (height, dest, ref, stride, 1, CPU_3DNOW); | |
938 } | |
939 | |
9852 | 940 static void MC_avg_x_8_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 941 int stride, int height) |
942 { | |
943 MC_avg2_8 (height, dest, ref, stride, 1, CPU_3DNOW); | |
944 } | |
945 | |
9852 | 946 static void MC_put_x_16_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 947 int stride, int height) |
948 { | |
949 MC_put2_16 (height, dest, ref, stride, 1, CPU_3DNOW); | |
950 } | |
951 | |
9852 | 952 static void MC_put_x_8_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 953 int stride, int height) |
954 { | |
955 MC_put2_8 (height, dest, ref, stride, 1, CPU_3DNOW); | |
956 } | |
957 | |
9852 | 958 static void MC_avg_y_16_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 959 int stride, int height) |
960 { | |
961 MC_avg2_16 (height, dest, ref, stride, stride, CPU_3DNOW); | |
962 } | |
963 | |
9852 | 964 static void MC_avg_y_8_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 965 int stride, int height) |
966 { | |
967 MC_avg2_8 (height, dest, ref, stride, stride, CPU_3DNOW); | |
968 } | |
969 | |
9852 | 970 static void MC_put_y_16_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 971 int stride, int height) |
972 { | |
973 MC_put2_16 (height, dest, ref, stride, stride, CPU_3DNOW); | |
974 } | |
975 | |
9852 | 976 static void MC_put_y_8_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 977 int stride, int height) |
978 { | |
979 MC_put2_8 (height, dest, ref, stride, stride, CPU_3DNOW); | |
980 } | |
981 | |
9852 | 982 static void MC_avg_xy_16_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 983 int stride, int height) |
984 { | |
985 MC_avg4_16 (height, dest, ref, stride, CPU_3DNOW); | |
986 } | |
987 | |
9852 | 988 static void MC_avg_xy_8_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 989 int stride, int height) |
990 { | |
991 MC_avg4_8 (height, dest, ref, stride, CPU_3DNOW); | |
992 } | |
993 | |
9852 | 994 static void MC_put_xy_16_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 995 int stride, int height) |
996 { | |
997 MC_put4_16 (height, dest, ref, stride, CPU_3DNOW); | |
998 } | |
999 | |
9852 | 1000 static void MC_put_xy_8_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 1001 int stride, int height) |
1002 { | |
1003 MC_put4_8 (height, dest, ref, stride, CPU_3DNOW); | |
1004 } | |
1005 | |
1006 | |
9852 | 1007 MPEG2_MC_EXTERN (3dnow) |
1 | 1008 |
1009 #endif |