Mercurial > mplayer.hg
annotate libmpeg2/motion_comp_mmx.c @ 31268:191f5098cfe6
Some more Changelog updates.
author | reimar |
---|---|
date | Sun, 06 Jun 2010 08:32:17 +0000 |
parents | e41a2492e665 |
children |
rev | line source |
---|---|
1 | 1 /* |
2 * motion_comp_mmx.c | |
10303 | 3 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org> |
9852 | 4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> |
1 | 5 * |
6 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. | |
9852 | 7 * See http://libmpeg2.sourceforge.net/ for updates. |
1 | 8 * |
9 * mpeg2dec is free software; you can redistribute it and/or modify | |
10 * it under the terms of the GNU General Public License as published by | |
11 * the Free Software Foundation; either version 2 of the License, or | |
12 * (at your option) any later version. | |
13 * | |
14 * mpeg2dec is distributed in the hope that it will be useful, | |
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
17 * GNU General Public License for more details. | |
18 * | |
19 * You should have received a copy of the GNU General Public License | |
20 * along with this program; if not, write to the Free Software | |
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
22 */ | |
23 | |
24 #include "config.h" | |
25 | |
28290 | 26 #if ARCH_X86 || ARCH_X86_64 |
1 | 27 |
28 #include <inttypes.h> | |
29 | |
9852 | 30 #include "mpeg2.h" |
12932 | 31 #include "attributes.h" |
1 | 32 #include "mpeg2_internal.h" |
33 #include "mmx.h" | |
34 | |
35 #define CPU_MMXEXT 0 | |
36 #define CPU_3DNOW 1 | |
37 | |
38 | |
30255
e41a2492e665
Avoid linking in assembler-optimized code that will never be used.
reimar
parents:
28290
diff
changeset
|
39 #if HAVE_MMX |
36 | 40 /* MMX code - needs a rewrite */ |
1 | 41 |
9852 | 42 /* |
43 * Motion Compensation frequently needs to average values using the | |
44 * formula (x+y+1)>>1. Both MMXEXT and 3Dnow include one instruction | |
45 * to compute this, but it's been left out of classic MMX. | |
46 * | |
47 * We need to be careful of overflows when doing this computation. | |
48 * Rather than unpacking data to 16-bits, which reduces parallelism, | |
49 * we use the following formulas: | |
50 * | |
51 * (x+y)>>1 == (x&y)+((x^y)>>1) | |
52 * (x+y+1)>>1 == (x|y)-((x^y)>>1) | |
53 */ | |
1 | 54 |
36 | 55 /* some rounding constants */ |
9852 | 56 static mmx_t mask1 = {0xfefefefefefefefeLL}; |
57 static mmx_t round4 = {0x0002000200020002LL}; | |
1 | 58 |
59 /* | |
60 * This code should probably be compiled with loop unrolling | |
61 * (i.e., -funroll-loops in gcc) because some of the loops | |
62 * use a small static number of iterations. This was written | |
63 * with the assumption the compiler knows best about when | |
64 * unrolling will help | |
65 */ | |
66 | |
27572 | 67 static inline void mmx_zero_reg (void) |
1 | 68 { |
36 | 69 /* load 0 into mm0 */ |
1 | 70 pxor_r2r (mm0, mm0); |
71 } | |
72 | |
9852 | 73 static inline void mmx_average_2_U8 (uint8_t * dest, const uint8_t * src1, |
74 const uint8_t * src2) | |
1 | 75 { |
36 | 76 /* *dest = (*src1 + *src2 + 1)/ 2; */ |
1 | 77 |
9852 | 78 movq_m2r (*src1, mm1); /* load 8 src1 bytes */ |
79 movq_r2r (mm1, mm2); /* copy 8 src1 bytes */ | |
1 | 80 |
9852 | 81 movq_m2r (*src2, mm3); /* load 8 src2 bytes */ |
82 movq_r2r (mm3, mm4); /* copy 8 src2 bytes */ | |
1 | 83 |
9852 | 84 pxor_r2r (mm1, mm3); /* xor src1 and src2 */ |
85 pand_m2r (mask1, mm3); /* mask lower bits */ | |
86 psrlq_i2r (1, mm3); /* /2 */ | |
87 por_r2r (mm2, mm4); /* or src1 and src2 */ | |
88 psubb_r2r (mm3, mm4); /* subtract subresults */ | |
89 movq_r2m (mm4, *dest); /* store result in dest */ | |
1 | 90 } |
91 | |
92 static inline void mmx_interp_average_2_U8 (uint8_t * dest, | |
9852 | 93 const uint8_t * src1, |
94 const uint8_t * src2) | |
1 | 95 { |
36 | 96 /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */ |
1 | 97 |
9852 | 98 movq_m2r (*dest, mm1); /* load 8 dest bytes */ |
99 movq_r2r (mm1, mm2); /* copy 8 dest bytes */ | |
1 | 100 |
9852 | 101 movq_m2r (*src1, mm3); /* load 8 src1 bytes */ |
102 movq_r2r (mm3, mm4); /* copy 8 src1 bytes */ | |
1 | 103 |
9852 | 104 movq_m2r (*src2, mm5); /* load 8 src2 bytes */ |
105 movq_r2r (mm5, mm6); /* copy 8 src2 bytes */ | |
1 | 106 |
9852 | 107 pxor_r2r (mm3, mm5); /* xor src1 and src2 */ |
108 pand_m2r (mask1, mm5); /* mask lower bits */ | |
109 psrlq_i2r (1, mm5); /* /2 */ | |
110 por_r2r (mm4, mm6); /* or src1 and src2 */ | |
111 psubb_r2r (mm5, mm6); /* subtract subresults */ | |
112 movq_r2r (mm6, mm5); /* copy subresult */ | |
1 | 113 |
9852 | 114 pxor_r2r (mm1, mm5); /* xor srcavg and dest */ |
115 pand_m2r (mask1, mm5); /* mask lower bits */ | |
116 psrlq_i2r (1, mm5); /* /2 */ | |
117 por_r2r (mm2, mm6); /* or srcavg and dest */ | |
118 psubb_r2r (mm5, mm6); /* subtract subresults */ | |
119 movq_r2m (mm6, *dest); /* store result in dest */ | |
1 | 120 } |
121 | |
9852 | 122 static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1, |
123 const uint8_t * src2, | |
124 const uint8_t * src3, | |
125 const uint8_t * src4) | |
1 | 126 { |
36 | 127 /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */ |
1 | 128 |
9852 | 129 movq_m2r (*src1, mm1); /* load 8 src1 bytes */ |
130 movq_r2r (mm1, mm2); /* copy 8 src1 bytes */ | |
1 | 131 |
9852 | 132 punpcklbw_r2r (mm0, mm1); /* unpack low src1 bytes */ |
133 punpckhbw_r2r (mm0, mm2); /* unpack high src1 bytes */ | |
1 | 134 |
9852 | 135 movq_m2r (*src2, mm3); /* load 8 src2 bytes */ |
136 movq_r2r (mm3, mm4); /* copy 8 src2 bytes */ | |
1 | 137 |
9852 | 138 punpcklbw_r2r (mm0, mm3); /* unpack low src2 bytes */ |
139 punpckhbw_r2r (mm0, mm4); /* unpack high src2 bytes */ | |
1 | 140 |
9852 | 141 paddw_r2r (mm3, mm1); /* add lows */ |
142 paddw_r2r (mm4, mm2); /* add highs */ | |
1 | 143 |
36 | 144 /* now have partials in mm1 and mm2 */ |
1 | 145 |
9852 | 146 movq_m2r (*src3, mm3); /* load 8 src3 bytes */ |
147 movq_r2r (mm3, mm4); /* copy 8 src3 bytes */ | |
1 | 148 |
9852 | 149 punpcklbw_r2r (mm0, mm3); /* unpack low src3 bytes */ |
150 punpckhbw_r2r (mm0, mm4); /* unpack high src3 bytes */ | |
1 | 151 |
9852 | 152 paddw_r2r (mm3, mm1); /* add lows */ |
153 paddw_r2r (mm4, mm2); /* add highs */ | |
1 | 154 |
9852 | 155 movq_m2r (*src4, mm5); /* load 8 src4 bytes */ |
156 movq_r2r (mm5, mm6); /* copy 8 src4 bytes */ | |
1 | 157 |
9852 | 158 punpcklbw_r2r (mm0, mm5); /* unpack low src4 bytes */ |
159 punpckhbw_r2r (mm0, mm6); /* unpack high src4 bytes */ | |
1 | 160 |
9852 | 161 paddw_r2r (mm5, mm1); /* add lows */ |
162 paddw_r2r (mm6, mm2); /* add highs */ | |
1 | 163 |
36 | 164 /* now have subtotal in mm1 and mm2 */ |
1 | 165 |
166 paddw_m2r (round4, mm1); | |
9852 | 167 psraw_i2r (2, mm1); /* /4 */ |
1 | 168 paddw_m2r (round4, mm2); |
9852 | 169 psraw_i2r (2, mm2); /* /4 */ |
1 | 170 |
9852 | 171 packuswb_r2r (mm2, mm1); /* pack (w/ saturation) */ |
172 movq_r2m (mm1, *dest); /* store result in dest */ | |
1 | 173 } |
174 | |
175 static inline void mmx_interp_average_4_U8 (uint8_t * dest, | |
9852 | 176 const uint8_t * src1, |
177 const uint8_t * src2, | |
178 const uint8_t * src3, | |
179 const uint8_t * src4) | |
1 | 180 { |
36 | 181 /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */ |
1 | 182 |
9852 | 183 movq_m2r (*src1, mm1); /* load 8 src1 bytes */ |
184 movq_r2r (mm1, mm2); /* copy 8 src1 bytes */ | |
1 | 185 |
9852 | 186 punpcklbw_r2r (mm0, mm1); /* unpack low src1 bytes */ |
187 punpckhbw_r2r (mm0, mm2); /* unpack high src1 bytes */ | |
1 | 188 |
9852 | 189 movq_m2r (*src2, mm3); /* load 8 src2 bytes */ |
190 movq_r2r (mm3, mm4); /* copy 8 src2 bytes */ | |
1 | 191 |
9852 | 192 punpcklbw_r2r (mm0, mm3); /* unpack low src2 bytes */ |
193 punpckhbw_r2r (mm0, mm4); /* unpack high src2 bytes */ | |
1 | 194 |
9852 | 195 paddw_r2r (mm3, mm1); /* add lows */ |
196 paddw_r2r (mm4, mm2); /* add highs */ | |
1 | 197 |
36 | 198 /* now have partials in mm1 and mm2 */ |
1 | 199 |
9852 | 200 movq_m2r (*src3, mm3); /* load 8 src3 bytes */ |
201 movq_r2r (mm3, mm4); /* copy 8 src3 bytes */ | |
1 | 202 |
9852 | 203 punpcklbw_r2r (mm0, mm3); /* unpack low src3 bytes */ |
204 punpckhbw_r2r (mm0, mm4); /* unpack high src3 bytes */ | |
1 | 205 |
9852 | 206 paddw_r2r (mm3, mm1); /* add lows */ |
207 paddw_r2r (mm4, mm2); /* add highs */ | |
1 | 208 |
9852 | 209 movq_m2r (*src4, mm5); /* load 8 src4 bytes */ |
210 movq_r2r (mm5, mm6); /* copy 8 src4 bytes */ | |
1 | 211 |
9852 | 212 punpcklbw_r2r (mm0, mm5); /* unpack low src4 bytes */ |
213 punpckhbw_r2r (mm0, mm6); /* unpack high src4 bytes */ | |
1 | 214 |
9852 | 215 paddw_r2r (mm5, mm1); /* add lows */ |
216 paddw_r2r (mm6, mm2); /* add highs */ | |
1 | 217 |
218 paddw_m2r (round4, mm1); | |
9852 | 219 psraw_i2r (2, mm1); /* /4 */ |
1 | 220 paddw_m2r (round4, mm2); |
9852 | 221 psraw_i2r (2, mm2); /* /4 */ |
1 | 222 |
36 | 223 /* now have subtotal/4 in mm1 and mm2 */ |
1 | 224 |
9852 | 225 movq_m2r (*dest, mm3); /* load 8 dest bytes */ |
226 movq_r2r (mm3, mm4); /* copy 8 dest bytes */ | |
1 | 227 |
9852 | 228 packuswb_r2r (mm2, mm1); /* pack (w/ saturation) */ |
229 movq_r2r (mm1,mm2); /* copy subresult */ | |
1 | 230 |
9852 | 231 pxor_r2r (mm1, mm3); /* xor srcavg and dest */ |
232 pand_m2r (mask1, mm3); /* mask lower bits */ | |
233 psrlq_i2r (1, mm3); /* /2 */ | |
234 por_r2r (mm2, mm4); /* or srcavg and dest */ | |
235 psubb_r2r (mm3, mm4); /* subtract subresults */ | |
236 movq_r2m (mm4, *dest); /* store result in dest */ | |
1 | 237 } |
238 | |
36 | 239 /*-----------------------------------------------------------------------*/ |
1 | 240 |
/*
 * Average a block of ref into dest, row by row:
 * dest = (dest + ref + 1) >> 1 for 8 bytes per row, or 16 bytes per row
 * when width == 16.  Both pointers advance by stride after each row.
 *
 * height must be at least 1: a row is processed before the count is tested.
 * mm0 is cleared on entry although mmx_average_2_U8 does not read it.
 */
static inline void MC_avg_mmx (const int width, int height, uint8_t * dest,
                               const uint8_t * ref, const int stride)
{
    const int second_half = (width == 16);

    mmx_zero_reg ();

    for (;;) {
        mmx_average_2_U8 (dest, dest, ref);
        if (second_half)
            mmx_average_2_U8 (dest + 8, dest + 8, ref + 8);

        dest += stride;
        ref += stride;

        if (--height == 0)
            break;
    }
}
256 | |
/* MMX: average a 16-byte-wide, height-row block of ref into dest
 * (ref is used as is, without sub-pixel interpolation). */
static void MC_avg_o_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    const int width = 16;

    MC_avg_mmx (width, height, dest, ref, stride);
}
262 | |
/* MMX: average an 8-byte-wide, height-row block of ref into dest
 * (ref is used as is, without sub-pixel interpolation). */
static void MC_avg_o_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    const int width = 8;

    MC_avg_mmx (width, height, dest, ref, stride);
}
268 | |
36 | 269 /*-----------------------------------------------------------------------*/ |
1 | 270 |
9852 | 271 static inline void MC_put_mmx (const int width, int height, uint8_t * dest, |
272 const uint8_t * ref, const int stride) | |
1 | 273 { |
274 mmx_zero_reg (); | |
275 | |
276 do { | |
9852 | 277 movq_m2r (* ref, mm1); /* load 8 ref bytes */ |
278 movq_r2m (mm1,* dest); /* store 8 bytes at curr */ | |
1 | 279 |
280 if (width == 16) | |
281 { | |
9852 | 282 movq_m2r (* (ref+8), mm1); /* load 8 ref bytes */ |
283 movq_r2m (mm1,* (dest+8)); /* store 8 bytes at curr */ | |
1 | 284 } |
285 | |
286 dest += stride; | |
287 ref += stride; | |
288 } while (--height); | |
289 } | |
290 | |
/* MMX: copy a 16-byte-wide, height-row block from ref to dest. */
static void MC_put_o_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    const int width = 16;

    MC_put_mmx (width, height, dest, ref, stride);
}
296 | |
/* MMX: copy an 8-byte-wide, height-row block from ref to dest. */
static void MC_put_o_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    const int width = 8;

    MC_put_mmx (width, height, dest, ref, stride);
}
302 | |
36 | 303 /*-----------------------------------------------------------------------*/ |
1 | 304 |
36 | 305 /* Half pixel interpolation in the x direction */ |
/*
 * Horizontal half-pixel prediction averaged into dest.  For every byte of
 * a row: t = (ref[i] + ref[i+1] + 1) >> 1, dest[i] = (dest[i] + t + 1) >> 1.
 * Handles 8 bytes per row, or 16 when width == 16; reads one byte beyond
 * the block width on each row of ref.
 *
 * height must be at least 1: a row is processed before the count is tested.
 * mm0 is cleared on entry although mmx_interp_average_2_U8 does not read it.
 */
static inline void MC_avg_x_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    const int second_half = (width == 16);

    mmx_zero_reg ();

    for (;;) {
        mmx_interp_average_2_U8 (dest, ref, ref + 1);
        if (second_half)
            mmx_interp_average_2_U8 (dest + 8, ref + 8, ref + 9);

        dest += stride;
        ref += stride;

        if (--height == 0)
            break;
    }
}
321 | |
/* MMX: 16-byte-wide horizontal half-pixel prediction, averaged into dest. */
static void MC_avg_x_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    const int width = 16;

    MC_avg_x_mmx (width, height, dest, ref, stride);
}
327 | |
/* MMX: 8-byte-wide horizontal half-pixel prediction, averaged into dest. */
static void MC_avg_x_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    const int width = 8;

    MC_avg_x_mmx (width, height, dest, ref, stride);
}
333 | |
36 | 334 /*-----------------------------------------------------------------------*/ |
1 | 335 |
/*
 * Horizontal half-pixel prediction written to dest:
 * dest[i] = (ref[i] + ref[i+1] + 1) >> 1 for 8 bytes per row, or 16 when
 * width == 16.  Reads one byte beyond the block width on each row of ref.
 *
 * height must be at least 1: a row is processed before the count is tested.
 * mm0 is cleared on entry although mmx_average_2_U8 does not read it.
 */
static inline void MC_put_x_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    const int second_half = (width == 16);

    mmx_zero_reg ();

    for (;;) {
        mmx_average_2_U8 (dest, ref, ref + 1);
        if (second_half)
            mmx_average_2_U8 (dest + 8, ref + 8, ref + 9);

        dest += stride;
        ref += stride;

        if (--height == 0)
            break;
    }
}
351 | |
/* MMX: 16-byte-wide horizontal half-pixel prediction, stored to dest. */
static void MC_put_x_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    const int width = 16;

    MC_put_x_mmx (width, height, dest, ref, stride);
}
357 | |
/* MMX: 8-byte-wide horizontal half-pixel prediction, stored to dest. */
static void MC_put_x_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    const int width = 8;

    MC_put_x_mmx (width, height, dest, ref, stride);
}
363 | |
36 | 364 /*-----------------------------------------------------------------------*/ |
1 | 365 |
/*
 * Diagonal half-pixel prediction averaged into dest.  Each output byte
 * combines the four neighbours ref[i], ref[i+1], ref[i+stride] and
 * ref[i+stride+1] as (sum + 2) >> 2 and is then averaged (rounding up)
 * with the byte already in dest.  Handles 8 bytes per row, or 16 when
 * width == 16; reads one extra column and one extra row of ref.
 *
 * height must be at least 1: a row is processed before the count is tested.
 */
static inline void MC_avg_xy_mmx (const int width, int height, uint8_t * dest,
                                  const uint8_t * ref, const int stride)
{
    /* the 4-sample helper unpacks against mm0, which must be zero */
    mmx_zero_reg ();

    do {
        const uint8_t * const below = ref + stride;     /* next source row */

        mmx_interp_average_4_U8 (dest, ref, ref + 1, below, below + 1);
        if (width == 16)
            mmx_interp_average_4_U8 (dest + 8, ref + 8, ref + 9,
                                     below + 8, below + 9);

        dest += stride;
        ref = below;
    } while (--height != 0);
}
385 | |
/* MMX: 16-byte-wide diagonal half-pixel prediction, averaged into dest. */
static void MC_avg_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    const int width = 16;

    MC_avg_xy_mmx (width, height, dest, ref, stride);
}
391 | |
/* MMX: 8-byte-wide diagonal half-pixel prediction, averaged into dest. */
static void MC_avg_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    const int width = 8;

    MC_avg_xy_mmx (width, height, dest, ref, stride);
}
397 | |
36 | 398 /*-----------------------------------------------------------------------*/ |
1 | 399 |
/*
 * Diagonal half-pixel prediction written to dest.  Each output byte is
 * (ref[i] + ref[i+1] + ref[i+stride] + ref[i+stride+1] + 2) >> 2.
 * Handles 8 bytes per row, or 16 when width == 16; reads one extra column
 * and one extra row of ref.
 *
 * height must be at least 1: a row is processed before the count is tested.
 */
static inline void MC_put_xy_mmx (const int width, int height, uint8_t * dest,
                                  const uint8_t * ref, const int stride)
{
    /* the 4-sample helper unpacks against mm0, which must be zero */
    mmx_zero_reg ();

    do {
        const uint8_t * const below = ref + stride;     /* next source row */

        mmx_average_4_U8 (dest, ref, ref + 1, below, below + 1);
        if (width == 16)
            mmx_average_4_U8 (dest + 8, ref + 8, ref + 9,
                              below + 8, below + 9);

        dest += stride;
        ref = below;
    } while (--height != 0);
}
418 | |
/* MMX: 16-byte-wide diagonal half-pixel prediction, stored to dest. */
static void MC_put_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    const int width = 16;

    MC_put_xy_mmx (width, height, dest, ref, stride);
}
424 | |
/* MMX: 8-byte-wide diagonal half-pixel prediction, stored to dest. */
static void MC_put_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    const int width = 8;

    MC_put_xy_mmx (width, height, dest, ref, stride);
}
430 | |
36 | 431 /*-----------------------------------------------------------------------*/ |
1 | 432 |
/*
 * Vertical half-pixel prediction averaged into dest.  For every byte of a
 * row: t = (ref[i] + ref[i+stride] + 1) >> 1,
 * dest[i] = (dest[i] + t + 1) >> 1.  Handles 8 bytes per row, or 16 when
 * width == 16; reads one row of ref beyond the block height.
 *
 * height must be at least 1: a row is processed before the count is tested.
 * mm0 is cleared on entry although mmx_interp_average_2_U8 does not read it.
 */
static inline void MC_avg_y_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
        const uint8_t * const below = ref + stride;     /* next source row */

        mmx_interp_average_2_U8 (dest, ref, below);
        if (width == 16)
            mmx_interp_average_2_U8 (dest + 8, ref + 8, below + 8);

        dest += stride;
        ref = below;
    } while (--height != 0);
}
451 | |
/* MMX: 16-byte-wide vertical half-pixel prediction, averaged into dest. */
static void MC_avg_y_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    const int width = 16;

    MC_avg_y_mmx (width, height, dest, ref, stride);
}
457 | |
/* MMX: 8-byte-wide vertical half-pixel prediction, averaged into dest. */
static void MC_avg_y_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    const int width = 8;

    MC_avg_y_mmx (width, height, dest, ref, stride);
}
463 | |
36 | 464 /*-----------------------------------------------------------------------*/ |
1 | 465 |
/*
 * Vertical half-pixel prediction written to dest:
 * dest[i] = (ref[i] + ref[i+stride] + 1) >> 1 for 8 bytes per row, or 16
 * when width == 16.  Reads one row of ref beyond the block height.
 *
 * height must be at least 1: a row is processed before the count is tested.
 * mm0 is cleared on entry although mmx_average_2_U8 does not read it.
 */
static inline void MC_put_y_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
        const uint8_t * const below = ref + stride;     /* next source row */

        mmx_average_2_U8 (dest, ref, below);
        if (width == 16)
            mmx_average_2_U8 (dest + 8, ref + 8, below + 8);

        dest += stride;
        ref = below;
    } while (--height != 0);
}
484 | |
/* MMX: 16-byte-wide vertical half-pixel prediction, stored to dest. */
static void MC_put_y_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    const int width = 16;

    MC_put_y_mmx (width, height, dest, ref, stride);
}
490 | |
/* MMX: 8-byte-wide vertical half-pixel prediction, stored to dest. */
static void MC_put_y_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    const int width = 8;

    MC_put_y_mmx (width, height, dest, ref, stride);
}
496 | |
497 | |
9852 | 498 MPEG2_MC_EXTERN (mmx) |
1 | 499 |
30255
e41a2492e665
Avoid linking in assembler-optimized code that will never be used.
reimar
parents:
28290
diff
changeset
|
500 #endif /* HAVE_MMX */ |
1 | 501 |
502 | |
503 | |
504 | |
505 | |
506 | |
36 | 507 /* CPU_MMXEXT/CPU_3DNOW adaptation layer */ |
1 | 508 |
/*
 * Register-to-register byte average, selected by instruction set:
 * pavgb when cpu == CPU_MMXEXT, pavgusb otherwise.  An identifier named
 * `cpu` must be in scope wherever the macro is expanded.
 */
#define pavg_r2r(src,dest)              \
do {                                    \
    if (cpu != CPU_MMXEXT)              \
        pavgusb_r2r (src, dest);        \
    else                                \
        pavgb_r2r (src, dest);          \
} while (0)
516 | |
/*
 * Memory-to-register byte average, selected by instruction set:
 * pavgb when cpu == CPU_MMXEXT, pavgusb otherwise.  An identifier named
 * `cpu` must be in scope wherever the macro is expanded.
 */
#define pavg_m2r(src,dest)              \
do {                                    \
    if (cpu != CPU_MMXEXT)              \
        pavgusb_m2r (src, dest);        \
    else                                \
        pavgb_m2r (src, dest);          \
} while (0)
524 | |
525 | |
36 | 526 /* CPU_MMXEXT code */ |
1 | 527 |
528 | |
9852 | 529 static inline void MC_put1_8 (int height, uint8_t * dest, const uint8_t * ref, |
530 const int stride) | |
1 | 531 { |
532 do { | |
533 movq_m2r (*ref, mm0); | |
534 movq_r2m (mm0, *dest); | |
535 ref += stride; | |
536 dest += stride; | |
537 } while (--height); | |
538 } | |
539 | |
9852 | 540 static inline void MC_put1_16 (int height, uint8_t * dest, const uint8_t * ref, |
541 const int stride) | |
1 | 542 { |
543 do { | |
544 movq_m2r (*ref, mm0); | |
545 movq_m2r (*(ref+8), mm1); | |
546 ref += stride; | |
547 movq_r2m (mm0, *dest); | |
548 movq_r2m (mm1, *(dest+8)); | |
549 dest += stride; | |
550 } while (--height); | |
551 } | |
552 | |
9852 | 553 static inline void MC_avg1_8 (int height, uint8_t * dest, const uint8_t * ref, |
554 const int stride, const int cpu) | |
1 | 555 { |
556 do { | |
557 movq_m2r (*ref, mm0); | |
558 pavg_m2r (*dest, mm0); | |
559 ref += stride; | |
560 movq_r2m (mm0, *dest); | |
561 dest += stride; | |
562 } while (--height); | |
563 } | |
564 | |
9852 | 565 static inline void MC_avg1_16 (int height, uint8_t * dest, const uint8_t * ref, |
566 const int stride, const int cpu) | |
1 | 567 { |
568 do { | |
569 movq_m2r (*ref, mm0); | |
570 movq_m2r (*(ref+8), mm1); | |
571 pavg_m2r (*dest, mm0); | |
572 pavg_m2r (*(dest+8), mm1); | |
573 movq_r2m (mm0, *dest); | |
574 ref += stride; | |
575 movq_r2m (mm1, *(dest+8)); | |
576 dest += stride; | |
577 } while (--height); | |
578 } | |
579 | |
9852 | 580 static inline void MC_put2_8 (int height, uint8_t * dest, const uint8_t * ref, |
581 const int stride, const int offset, | |
582 const int cpu) | |
1 | 583 { |
584 do { | |
585 movq_m2r (*ref, mm0); | |
586 pavg_m2r (*(ref+offset), mm0); | |
587 ref += stride; | |
588 movq_r2m (mm0, *dest); | |
589 dest += stride; | |
590 } while (--height); | |
591 } | |
592 | |
9852 | 593 static inline void MC_put2_16 (int height, uint8_t * dest, const uint8_t * ref, |
594 const int stride, const int offset, | |
595 const int cpu) | |
1 | 596 { |
597 do { | |
598 movq_m2r (*ref, mm0); | |
599 movq_m2r (*(ref+8), mm1); | |
600 pavg_m2r (*(ref+offset), mm0); | |
601 pavg_m2r (*(ref+offset+8), mm1); | |
602 movq_r2m (mm0, *dest); | |
603 ref += stride; | |
604 movq_r2m (mm1, *(dest+8)); | |
605 dest += stride; | |
606 } while (--height); | |
607 } | |
608 | |
9852 | 609 static inline void MC_avg2_8 (int height, uint8_t * dest, const uint8_t * ref, |
610 const int stride, const int offset, | |
611 const int cpu) | |
1 | 612 { |
613 do { | |
614 movq_m2r (*ref, mm0); | |
615 pavg_m2r (*(ref+offset), mm0); | |
616 pavg_m2r (*dest, mm0); | |
617 ref += stride; | |
618 movq_r2m (mm0, *dest); | |
619 dest += stride; | |
620 } while (--height); | |
621 } | |
622 | |
9852 | 623 static inline void MC_avg2_16 (int height, uint8_t * dest, const uint8_t * ref, |
624 const int stride, const int offset, | |
625 const int cpu) | |
1 | 626 { |
627 do { | |
628 movq_m2r (*ref, mm0); | |
629 movq_m2r (*(ref+8), mm1); | |
630 pavg_m2r (*(ref+offset), mm0); | |
631 pavg_m2r (*(ref+offset+8), mm1); | |
632 pavg_m2r (*dest, mm0); | |
633 pavg_m2r (*(dest+8), mm1); | |
634 ref += stride; | |
635 movq_r2m (mm0, *dest); | |
636 movq_r2m (mm1, *(dest+8)); | |
637 dest += stride; | |
638 } while (--height); | |
639 } | |
640 | |
641 static mmx_t mask_one = {0x0101010101010101LL}; | |
642 | |
9852 | 643 static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref, |
644 const int stride, const int cpu) | |
1 | 645 { |
646 movq_m2r (*ref, mm0); | |
647 movq_m2r (*(ref+1), mm1); | |
648 movq_r2r (mm0, mm7); | |
649 pxor_r2r (mm1, mm7); | |
650 pavg_r2r (mm1, mm0); | |
651 ref += stride; | |
652 | |
653 do { | |
654 movq_m2r (*ref, mm2); | |
655 movq_r2r (mm0, mm5); | |
656 | |
657 movq_m2r (*(ref+1), mm3); | |
658 movq_r2r (mm2, mm6); | |
659 | |
660 pxor_r2r (mm3, mm6); | |
661 pavg_r2r (mm3, mm2); | |
662 | |
663 por_r2r (mm6, mm7); | |
664 pxor_r2r (mm2, mm5); | |
665 | |
666 pand_r2r (mm5, mm7); | |
667 pavg_r2r (mm2, mm0); | |
668 | |
669 pand_m2r (mask_one, mm7); | |
670 | |
671 psubusb_r2r (mm7, mm0); | |
672 | |
673 ref += stride; | |
674 movq_r2m (mm0, *dest); | |
675 dest += stride; | |
676 | |
9852 | 677 movq_r2r (mm6, mm7); /* unroll ! */ |
678 movq_r2r (mm2, mm0); /* unroll ! */ | |
1 | 679 } while (--height); |
680 } | |
681 | |
9852 | 682 static inline void MC_put4_16 (int height, uint8_t * dest, const uint8_t * ref, |
683 const int stride, const int cpu) | |
1 | 684 { |
685 do { | |
686 movq_m2r (*ref, mm0); | |
687 movq_m2r (*(ref+stride+1), mm1); | |
688 movq_r2r (mm0, mm7); | |
689 movq_m2r (*(ref+1), mm2); | |
690 pxor_r2r (mm1, mm7); | |
691 movq_m2r (*(ref+stride), mm3); | |
692 movq_r2r (mm2, mm6); | |
693 pxor_r2r (mm3, mm6); | |
694 pavg_r2r (mm1, mm0); | |
695 pavg_r2r (mm3, mm2); | |
696 por_r2r (mm6, mm7); | |
697 movq_r2r (mm0, mm6); | |
698 pxor_r2r (mm2, mm6); | |
699 pand_r2r (mm6, mm7); | |
700 pand_m2r (mask_one, mm7); | |
701 pavg_r2r (mm2, mm0); | |
702 psubusb_r2r (mm7, mm0); | |
703 movq_r2m (mm0, *dest); | |
704 | |
705 movq_m2r (*(ref+8), mm0); | |
706 movq_m2r (*(ref+stride+9), mm1); | |
707 movq_r2r (mm0, mm7); | |
708 movq_m2r (*(ref+9), mm2); | |
709 pxor_r2r (mm1, mm7); | |
710 movq_m2r (*(ref+stride+8), mm3); | |
711 movq_r2r (mm2, mm6); | |
712 pxor_r2r (mm3, mm6); | |
713 pavg_r2r (mm1, mm0); | |
714 pavg_r2r (mm3, mm2); | |
715 por_r2r (mm6, mm7); | |
716 movq_r2r (mm0, mm6); | |
717 pxor_r2r (mm2, mm6); | |
718 pand_r2r (mm6, mm7); | |
719 pand_m2r (mask_one, mm7); | |
720 pavg_r2r (mm2, mm0); | |
721 psubusb_r2r (mm7, mm0); | |
722 ref += stride; | |
723 movq_r2m (mm0, *(dest+8)); | |
724 dest += stride; | |
725 } while (--height); | |
726 } | |
727 | |
9852 | 728 static inline void MC_avg4_8 (int height, uint8_t * dest, const uint8_t * ref, |
729 const int stride, const int cpu) | |
1 | 730 { |
731 do { | |
732 movq_m2r (*ref, mm0); | |
733 movq_m2r (*(ref+stride+1), mm1); | |
734 movq_r2r (mm0, mm7); | |
735 movq_m2r (*(ref+1), mm2); | |
736 pxor_r2r (mm1, mm7); | |
737 movq_m2r (*(ref+stride), mm3); | |
738 movq_r2r (mm2, mm6); | |
739 pxor_r2r (mm3, mm6); | |
740 pavg_r2r (mm1, mm0); | |
741 pavg_r2r (mm3, mm2); | |
742 por_r2r (mm6, mm7); | |
743 movq_r2r (mm0, mm6); | |
744 pxor_r2r (mm2, mm6); | |
745 pand_r2r (mm6, mm7); | |
746 pand_m2r (mask_one, mm7); | |
747 pavg_r2r (mm2, mm0); | |
748 psubusb_r2r (mm7, mm0); | |
749 movq_m2r (*dest, mm1); | |
750 pavg_r2r (mm1, mm0); | |
751 ref += stride; | |
752 movq_r2m (mm0, *dest); | |
753 dest += stride; | |
754 } while (--height); | |
755 } | |
756 | |
9852 | 757 static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref, |
758 const int stride, const int cpu) | |
1 | 759 { |
760 do { | |
761 movq_m2r (*ref, mm0); | |
762 movq_m2r (*(ref+stride+1), mm1); | |
763 movq_r2r (mm0, mm7); | |
764 movq_m2r (*(ref+1), mm2); | |
765 pxor_r2r (mm1, mm7); | |
766 movq_m2r (*(ref+stride), mm3); | |
767 movq_r2r (mm2, mm6); | |
768 pxor_r2r (mm3, mm6); | |
769 pavg_r2r (mm1, mm0); | |
770 pavg_r2r (mm3, mm2); | |
771 por_r2r (mm6, mm7); | |
772 movq_r2r (mm0, mm6); | |
773 pxor_r2r (mm2, mm6); | |
774 pand_r2r (mm6, mm7); | |
775 pand_m2r (mask_one, mm7); | |
776 pavg_r2r (mm2, mm0); | |
777 psubusb_r2r (mm7, mm0); | |
778 movq_m2r (*dest, mm1); | |
779 pavg_r2r (mm1, mm0); | |
780 movq_r2m (mm0, *dest); | |
781 | |
782 movq_m2r (*(ref+8), mm0); | |
783 movq_m2r (*(ref+stride+9), mm1); | |
784 movq_r2r (mm0, mm7); | |
785 movq_m2r (*(ref+9), mm2); | |
786 pxor_r2r (mm1, mm7); | |
787 movq_m2r (*(ref+stride+8), mm3); | |
788 movq_r2r (mm2, mm6); | |
789 pxor_r2r (mm3, mm6); | |
790 pavg_r2r (mm1, mm0); | |
791 pavg_r2r (mm3, mm2); | |
792 por_r2r (mm6, mm7); | |
793 movq_r2r (mm0, mm6); | |
794 pxor_r2r (mm2, mm6); | |
795 pand_r2r (mm6, mm7); | |
796 pand_m2r (mask_one, mm7); | |
797 pavg_r2r (mm2, mm0); | |
798 psubusb_r2r (mm7, mm0); | |
799 movq_m2r (*(dest+8), mm1); | |
800 pavg_r2r (mm1, mm0); | |
801 ref += stride; | |
802 movq_r2m (mm0, *(dest+8)); | |
803 dest += stride; | |
804 } while (--height); | |
805 } | |
806 | |
30255
e41a2492e665
Avoid linking in assembler-optimized code that will never be used.
reimar
parents:
28290
diff
changeset
|
807 #if HAVE_MMX2 |
e41a2492e665
Avoid linking in assembler-optimized code that will never be used.
reimar
parents:
28290
diff
changeset
|
808 |
9852 | 809 static void MC_avg_o_16_mmxext (uint8_t * dest, const uint8_t * ref, |
810 int stride, int height) | |
1 | 811 { |
812 MC_avg1_16 (height, dest, ref, stride, CPU_MMXEXT); | |
813 } | |
814 | |
9852 | 815 static void MC_avg_o_8_mmxext (uint8_t * dest, const uint8_t * ref, |
816 int stride, int height) | |
1 | 817 { |
818 MC_avg1_8 (height, dest, ref, stride, CPU_MMXEXT); | |
819 } | |
820 | |
/* MMXEXT table entry: plain copy of a 16-byte-wide block from ref to dest. */
static void MC_put_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}
826 | |
/* MMXEXT table entry: plain copy of an 8-byte-wide block from ref to dest. */
static void MC_put_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}
832 | |
9852 | 833 static void MC_avg_x_16_mmxext (uint8_t * dest, const uint8_t * ref, |
834 int stride, int height) | |
1 | 835 { |
836 MC_avg2_16 (height, dest, ref, stride, 1, CPU_MMXEXT); | |
837 } | |
838 | |
9852 | 839 static void MC_avg_x_8_mmxext (uint8_t * dest, const uint8_t * ref, |
840 int stride, int height) | |
1 | 841 { |
842 MC_avg2_8 (height, dest, ref, stride, 1, CPU_MMXEXT); | |
843 } | |
844 | |
9852 | 845 static void MC_put_x_16_mmxext (uint8_t * dest, const uint8_t * ref, |
846 int stride, int height) | |
1 | 847 { |
848 MC_put2_16 (height, dest, ref, stride, 1, CPU_MMXEXT); | |
849 } | |
850 | |
9852 | 851 static void MC_put_x_8_mmxext (uint8_t * dest, const uint8_t * ref, |
852 int stride, int height) | |
1 | 853 { |
854 MC_put2_8 (height, dest, ref, stride, 1, CPU_MMXEXT); | |
855 } | |
856 | |
9852 | 857 static void MC_avg_y_16_mmxext (uint8_t * dest, const uint8_t * ref, |
858 int stride, int height) | |
1 | 859 { |
860 MC_avg2_16 (height, dest, ref, stride, stride, CPU_MMXEXT); | |
861 } | |
862 | |
9852 | 863 static void MC_avg_y_8_mmxext (uint8_t * dest, const uint8_t * ref, |
864 int stride, int height) | |
1 | 865 { |
866 MC_avg2_8 (height, dest, ref, stride, stride, CPU_MMXEXT); | |
867 } | |
868 | |
9852 | 869 static void MC_put_y_16_mmxext (uint8_t * dest, const uint8_t * ref, |
870 int stride, int height) | |
1 | 871 { |
872 MC_put2_16 (height, dest, ref, stride, stride, CPU_MMXEXT); | |
873 } | |
874 | |
9852 | 875 static void MC_put_y_8_mmxext (uint8_t * dest, const uint8_t * ref, |
876 int stride, int height) | |
1 | 877 { |
878 MC_put2_8 (height, dest, ref, stride, stride, CPU_MMXEXT); | |
879 } | |
880 | |
9852 | 881 static void MC_avg_xy_16_mmxext (uint8_t * dest, const uint8_t * ref, |
882 int stride, int height) | |
1 | 883 { |
884 MC_avg4_16 (height, dest, ref, stride, CPU_MMXEXT); | |
885 } | |
886 | |
9852 | 887 static void MC_avg_xy_8_mmxext (uint8_t * dest, const uint8_t * ref, |
888 int stride, int height) | |
1 | 889 { |
890 MC_avg4_8 (height, dest, ref, stride, CPU_MMXEXT); | |
891 } | |
892 | |
9852 | 893 static void MC_put_xy_16_mmxext (uint8_t * dest, const uint8_t * ref, |
894 int stride, int height) | |
1 | 895 { |
896 MC_put4_16 (height, dest, ref, stride, CPU_MMXEXT); | |
897 } | |
898 | |
9852 | 899 static void MC_put_xy_8_mmxext (uint8_t * dest, const uint8_t * ref, |
900 int stride, int height) | |
1 | 901 { |
902 MC_put4_8 (height, dest, ref, stride, CPU_MMXEXT); | |
903 } | |
904 | |
905 | |
9852 | 906 MPEG2_MC_EXTERN (mmxext) |
1 | 907 |
30255
e41a2492e665
Avoid linking in assembler-optimized code that will never be used.
reimar
parents:
28290
diff
changeset
|
908 #endif /* HAVE_MMX2 */ |
1 | 909 |
30255
e41a2492e665
Avoid linking in assembler-optimized code that will never be used.
reimar
parents:
28290
diff
changeset
|
910 #if HAVE_AMD3DNOW |
1 | 911 |
9852 | 912 static void MC_avg_o_16_3dnow (uint8_t * dest, const uint8_t * ref, |
913 int stride, int height) | |
1 | 914 { |
915 MC_avg1_16 (height, dest, ref, stride, CPU_3DNOW); | |
916 } | |
917 | |
9852 | 918 static void MC_avg_o_8_3dnow (uint8_t * dest, const uint8_t * ref, |
919 int stride, int height) | |
1 | 920 { |
921 MC_avg1_8 (height, dest, ref, stride, CPU_3DNOW); | |
922 } | |
923 | |
/* 3DNow! table entry: plain copy of a 16-byte-wide block from ref to dest. */
static void MC_put_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}
929 | |
/* 3DNow! table entry: plain copy of an 8-byte-wide block from ref to dest. */
static void MC_put_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}
935 | |
9852 | 936 static void MC_avg_x_16_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 937 int stride, int height) |
938 { | |
939 MC_avg2_16 (height, dest, ref, stride, 1, CPU_3DNOW); | |
940 } | |
941 | |
9852 | 942 static void MC_avg_x_8_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 943 int stride, int height) |
944 { | |
945 MC_avg2_8 (height, dest, ref, stride, 1, CPU_3DNOW); | |
946 } | |
947 | |
9852 | 948 static void MC_put_x_16_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 949 int stride, int height) |
950 { | |
951 MC_put2_16 (height, dest, ref, stride, 1, CPU_3DNOW); | |
952 } | |
953 | |
9852 | 954 static void MC_put_x_8_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 955 int stride, int height) |
956 { | |
957 MC_put2_8 (height, dest, ref, stride, 1, CPU_3DNOW); | |
958 } | |
959 | |
9852 | 960 static void MC_avg_y_16_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 961 int stride, int height) |
962 { | |
963 MC_avg2_16 (height, dest, ref, stride, stride, CPU_3DNOW); | |
964 } | |
965 | |
9852 | 966 static void MC_avg_y_8_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 967 int stride, int height) |
968 { | |
969 MC_avg2_8 (height, dest, ref, stride, stride, CPU_3DNOW); | |
970 } | |
971 | |
9852 | 972 static void MC_put_y_16_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 973 int stride, int height) |
974 { | |
975 MC_put2_16 (height, dest, ref, stride, stride, CPU_3DNOW); | |
976 } | |
977 | |
9852 | 978 static void MC_put_y_8_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 979 int stride, int height) |
980 { | |
981 MC_put2_8 (height, dest, ref, stride, stride, CPU_3DNOW); | |
982 } | |
983 | |
9852 | 984 static void MC_avg_xy_16_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 985 int stride, int height) |
986 { | |
987 MC_avg4_16 (height, dest, ref, stride, CPU_3DNOW); | |
988 } | |
989 | |
9852 | 990 static void MC_avg_xy_8_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 991 int stride, int height) |
992 { | |
993 MC_avg4_8 (height, dest, ref, stride, CPU_3DNOW); | |
994 } | |
995 | |
9852 | 996 static void MC_put_xy_16_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 997 int stride, int height) |
998 { | |
999 MC_put4_16 (height, dest, ref, stride, CPU_3DNOW); | |
1000 } | |
1001 | |
9852 | 1002 static void MC_put_xy_8_3dnow (uint8_t * dest, const uint8_t * ref, |
1 | 1003 int stride, int height) |
1004 { | |
1005 MC_put4_8 (height, dest, ref, stride, CPU_3DNOW); | |
1006 } | |
1007 | |
1008 | |
9852 | 1009 MPEG2_MC_EXTERN (3dnow) |
1 | 1010 |
30255
e41a2492e665
Avoid linking in assembler-optimized code that will never be used.
reimar
parents:
28290
diff
changeset
|
1011 #endif /* HAVE_AMD3DNOW */ |
e41a2492e665
Avoid linking in assembler-optimized code that will never be used.
reimar
parents:
28290
diff
changeset
|
1012 |
1 | 1013 #endif |