annotate libmpeg2/motion_comp_mmx.c @ 33517:850a3272e10d (mplayer.hg)

Change code to allow STREAM_CTRL_GET_CURRENT_TIME with cache enabled.
Because that time comes from what is currently being read into the cache,
it is unfortunately somewhat inaccurate and unsmooth; however, for streams
that do have stream timestamps it is still a lot better than going by the
demuxer alone.
In particular it fixes bug #1081: when starting a DVD with -chapter,
subsequent seeks would be relative to the start of the DVD instead of the
current position.

author:  reimar
date:    Sun, 12 Jun 2011 11:26:22 +0000
parents: e41a2492e665

/*
 * motion_comp_mmx.c
 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 * See http://libmpeg2.sourceforge.net/ for updates.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include "config.h"

#if ARCH_X86 || ARCH_X86_64

#include <inttypes.h>

#include "mpeg2.h"
#include "attributes.h"
#include "mpeg2_internal.h"
#include "mmx.h"

#define CPU_MMXEXT 0
#define CPU_3DNOW 1


#if HAVE_MMX
/* MMX code - needs a rewrite */

/*
 * Motion Compensation frequently needs to average values using the
 * formula (x+y+1)>>1. Both MMXEXT and 3Dnow include one instruction
 * to compute this, but it's been left out of classic MMX.
 *
 * We need to be careful of overflows when doing this computation.
 * Rather than unpacking data to 16 bits, which reduces parallelism,
 * we use the following formulas:
 *
 * (x+y)>>1 == (x&y)+((x^y)>>1)
 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
 */
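
/*
 * A concrete example of the round-up identity: for bytes x = 0xff and
 * y = 0xfe, (x+y+1)>>1 = 0x1fe>>1 = 0xff, which would overflow 8 bits
 * if summed directly, while (x|y) - ((x^y)>>1) = 0xff - (0x01>>1) = 0xff
 * never leaves a byte. In the code below, mask1 (0xfe in every byte)
 * clears each byte's least significant bit before psrlq shifts the whole
 * 64-bit register, so that bit cannot leak into the adjacent byte.
 */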

/* some rounding constants */
static mmx_t mask1 = {0xfefefefefefefefeLL};
static mmx_t round4 = {0x0002000200020002LL};

/*
 * This code should probably be compiled with loop unrolling
 * (i.e., -funroll-loops in gcc) because some of the loops
 * use a small static number of iterations. This was written
 * with the assumption that the compiler knows best about when
 * unrolling will help.
 */

static inline void mmx_zero_reg (void)
{
    /* load 0 into mm0 */
    pxor_r2r (mm0, mm0);
}

static inline void mmx_average_2_U8 (uint8_t * dest, const uint8_t * src1,
                                     const uint8_t * src2)
{
    /* *dest = (*src1 + *src2 + 1) / 2; */

    movq_m2r (*src1, mm1);        /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);          /* copy 8 src1 bytes */

    movq_m2r (*src2, mm3);        /* load 8 src2 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src2 bytes */

    pxor_r2r (mm1, mm3);          /* xor src1 and src2 */
    pand_m2r (mask1, mm3);        /* mask lower bits */
    psrlq_i2r (1, mm3);           /* /2 */
    por_r2r (mm2, mm4);           /* or src1 and src2 */
    psubb_r2r (mm3, mm4);         /* subtract subresults */
    movq_r2m (mm4, *dest);        /* store result in dest */
}

static inline void mmx_interp_average_2_U8 (uint8_t * dest,
                                            const uint8_t * src1,
                                            const uint8_t * src2)
{
    /* *dest = (*dest + (*src1 + *src2 + 1) / 2 + 1) / 2; */

    movq_m2r (*dest, mm1);        /* load 8 dest bytes */
    movq_r2r (mm1, mm2);          /* copy 8 dest bytes */

    movq_m2r (*src1, mm3);        /* load 8 src1 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src1 bytes */

    movq_m2r (*src2, mm5);        /* load 8 src2 bytes */
    movq_r2r (mm5, mm6);          /* copy 8 src2 bytes */

    pxor_r2r (mm3, mm5);          /* xor src1 and src2 */
    pand_m2r (mask1, mm5);        /* mask lower bits */
    psrlq_i2r (1, mm5);           /* /2 */
    por_r2r (mm4, mm6);           /* or src1 and src2 */
    psubb_r2r (mm5, mm6);         /* subtract subresults */
    movq_r2r (mm6, mm5);          /* copy subresult */

    pxor_r2r (mm1, mm5);          /* xor srcavg and dest */
    pand_m2r (mask1, mm5);        /* mask lower bits */
    psrlq_i2r (1, mm5);           /* /2 */
    por_r2r (mm2, mm6);           /* or srcavg and dest */
    psubb_r2r (mm5, mm6);         /* subtract subresults */
    movq_r2m (mm6, *dest);        /* store result in dest */
}

static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
                                     const uint8_t * src2,
                                     const uint8_t * src3,
                                     const uint8_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2) / 4; */

    movq_m2r (*src1, mm1);        /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);          /* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);     /* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);     /* unpack high src1 bytes */

    movq_m2r (*src2, mm3);        /* load 8 src2 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);     /* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);     /* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);         /* add lows */
    paddw_r2r (mm4, mm2);         /* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);        /* load 8 src3 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);     /* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);     /* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);         /* add lows */
    paddw_r2r (mm4, mm2);         /* add highs */

    movq_m2r (*src4, mm5);        /* load 8 src4 bytes */
    movq_r2r (mm5, mm6);          /* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);     /* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);     /* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);         /* add lows */
    paddw_r2r (mm6, mm2);         /* add highs */

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           /* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           /* /4 */

    packuswb_r2r (mm2, mm1);      /* pack (w/ saturation) */
    movq_r2m (mm1, *dest);        /* store result in dest */
}

static inline void mmx_interp_average_4_U8 (uint8_t * dest,
                                            const uint8_t * src1,
                                            const uint8_t * src2,
                                            const uint8_t * src3,
                                            const uint8_t * src4)
{
    /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2) / 4 + 1) / 2; */

    movq_m2r (*src1, mm1);        /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);          /* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);     /* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);     /* unpack high src1 bytes */

    movq_m2r (*src2, mm3);        /* load 8 src2 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);     /* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);     /* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);         /* add lows */
    paddw_r2r (mm4, mm2);         /* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);        /* load 8 src3 bytes */
    movq_r2r (mm3, mm4);          /* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);     /* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);     /* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);         /* add lows */
    paddw_r2r (mm4, mm2);         /* add highs */

    movq_m2r (*src4, mm5);        /* load 8 src4 bytes */
    movq_r2r (mm5, mm6);          /* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);     /* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);     /* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);         /* add lows */
    paddw_r2r (mm6, mm2);         /* add highs */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);           /* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);           /* /4 */

    /* now have subtotal/4 in mm1 and mm2 */

    movq_m2r (*dest, mm3);        /* load 8 dest bytes */
    movq_r2r (mm3, mm4);          /* copy 8 dest bytes */

    packuswb_r2r (mm2, mm1);      /* pack (w/ saturation) */
    movq_r2r (mm1, mm2);          /* copy subresult */

    pxor_r2r (mm1, mm3);          /* xor srcavg and dest */
    pand_m2r (mask1, mm3);        /* mask lower bits */
    psrlq_i2r (1, mm3);           /* /2 */
    por_r2r (mm2, mm4);           /* or srcavg and dest */
    psubb_r2r (mm3, mm4);         /* subtract subresults */
    movq_r2m (mm4, *dest);        /* store result in dest */
}

/*-----------------------------------------------------------------------*/

static inline void MC_avg_mmx (const int width, int height, uint8_t * dest,
                               const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, dest, ref);

        if (width == 16)
            mmx_average_2_U8 (dest+8, dest+8, ref+8);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_avg_o_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_avg_mmx (16, height, dest, ref, stride);
}

static void MC_avg_o_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_avg_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_put_mmx (const int width, int height, uint8_t * dest,
                               const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
        movq_m2r (*ref, mm1);             /* load 8 ref bytes */
        movq_r2m (mm1, *dest);            /* store 8 bytes at curr */

        if (width == 16)
        {
            movq_m2r (*(ref+8), mm1);     /* load 8 ref bytes */
            movq_r2m (mm1, *(dest+8));    /* store 8 bytes at curr */
        }

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_put_o_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_put_mmx (16, height, dest, ref, stride);
}

static void MC_put_o_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_put_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

/* Half pixel interpolation in the x direction */
static inline void MC_avg_x_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_avg_x_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_avg_x_mmx (16, height, dest, ref, stride);
}

static void MC_avg_x_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_avg_x_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_put_x_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref+1);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref+9);

        dest += stride;
        ref += stride;
    } while (--height);
}

static void MC_put_x_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_put_x_mmx (16, height, dest, ref, stride);
}

static void MC_put_x_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_put_x_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_avg_xy_mmx (const int width, int height, uint8_t * dest,
                                  const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
        mmx_interp_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

        if (width == 16)
            mmx_interp_average_4_U8 (dest+8, ref+8, ref+9,
                                     ref_next+8, ref_next+9);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_avg_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_avg_xy_mmx (16, height, dest, ref, stride);
}

static void MC_avg_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_avg_xy_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_put_xy_mmx (const int width, int height, uint8_t * dest,
                                  const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
        mmx_average_4_U8 (dest, ref, ref+1, ref_next, ref_next+1);

        if (width == 16)
            mmx_average_4_U8 (dest+8, ref+8, ref+9, ref_next+8, ref_next+9);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_put_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_put_xy_mmx (16, height, dest, ref, stride);
}

static void MC_put_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_put_xy_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_avg_y_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
        mmx_interp_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_interp_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_avg_y_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_avg_y_mmx (16, height, dest, ref, stride);
}

static void MC_avg_y_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_avg_y_mmx (8, height, dest, ref, stride);
}

/*-----------------------------------------------------------------------*/

static inline void MC_put_y_mmx (const int width, int height, uint8_t * dest,
                                 const uint8_t * ref, const int stride)
{
    const uint8_t * ref_next = ref + stride;

    mmx_zero_reg ();

    do {
        mmx_average_2_U8 (dest, ref, ref_next);

        if (width == 16)
            mmx_average_2_U8 (dest+8, ref+8, ref_next+8);

        dest += stride;
        ref += stride;
        ref_next += stride;
    } while (--height);
}

static void MC_put_y_16_mmx (uint8_t * dest, const uint8_t * ref,
                             int stride, int height)
{
    MC_put_y_mmx (16, height, dest, ref, stride);
}

static void MC_put_y_8_mmx (uint8_t * dest, const uint8_t * ref,
                            int stride, int height)
{
    MC_put_y_mmx (8, height, dest, ref, stride);
}


MPEG2_MC_EXTERN (mmx)
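
/*
 * MPEG2_MC_EXTERN comes from mpeg2_internal.h; for a given suffix it
 * gathers the MC_{avg,put}_{o,x,y,xy}_{16,8}_<suffix> functions above
 * into an mpeg2_mc_t function-pointer table (here, mpeg2_mc_mmx) that
 * libmpeg2's CPU detection selects at runtime. The exact expansion is
 * an assumption based on the libmpeg2 sources, not shown in this file.
 */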

#endif /* HAVE_MMX */


/* CPU_MMXEXT/CPU_3DNOW adaptation layer */

#define pavg_r2r(src,dest) \
do { \
    if (cpu == CPU_MMXEXT) \
        pavgb_r2r (src, dest); \
    else \
        pavgusb_r2r (src, dest); \
} while (0)

#define pavg_m2r(src,dest) \
do { \
    if (cpu == CPU_MMXEXT) \
        pavgb_m2r (src, dest); \
    else \
        pavgusb_m2r (src, dest); \
} while (0)
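
/*
 * Note: both pavgb (MMXEXT/SSE) and pavgusb (3DNow!) compute the
 * byte-wise rounded-up average (a+b+1)>>1 in a single instruction.
 * Since every caller below passes cpu as a compile-time constant, the
 * if() in these macros should fold away once the inline functions are
 * specialized.
 */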


/* CPU_MMXEXT code */


static inline void MC_put1_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride)
{
    do {
        movq_m2r (*ref, mm0);
        movq_r2m (mm0, *dest);
        ref += stride;
        dest += stride;
    } while (--height);
}

static inline void MC_put1_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        ref += stride;
        movq_r2m (mm0, *dest);
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}

static inline void MC_avg1_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*dest, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}

static inline void MC_avg1_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride, const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        pavg_m2r (*dest, mm0);
        pavg_m2r (*(dest+8), mm1);
        movq_r2m (mm0, *dest);
        ref += stride;
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}

static inline void MC_put2_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int offset,
                              const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*(ref+offset), mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}

static inline void MC_put2_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride, const int offset,
                               const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*(ref+offset+8), mm1);
        movq_r2m (mm0, *dest);
        ref += stride;
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}

static inline void MC_avg2_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int offset,
                              const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*dest, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}

static inline void MC_avg2_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride, const int offset,
                               const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*(ref+offset+8), mm1);
        pavg_m2r (*dest, mm0);
        pavg_m2r (*(dest+8), mm1);
        ref += stride;
        movq_r2m (mm0, *dest);
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}

static mmx_t mask_one = {0x0101010101010101LL};
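
/*
 * mask_one supports the two-dimensional (xy) half-pel cases below:
 * averaging two already-rounded pavg results can round up twice, so the
 * code tracks the dropped low bits via the pxor terms, isolates a
 * per-byte 0/1 over-rounding flag with mask_one, and removes it with
 * psubusb, making the result exactly (a+b+c+d+2)>>2.
 */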

static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int cpu)
{
    movq_m2r (*ref, mm0);
    movq_m2r (*(ref+1), mm1);
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);
    pavg_r2r (mm1, mm0);
    ref += stride;

    do {
        movq_m2r (*ref, mm2);
        movq_r2r (mm0, mm5);

        movq_m2r (*(ref+1), mm3);
        movq_r2r (mm2, mm6);

        pxor_r2r (mm3, mm6);
        pavg_r2r (mm3, mm2);

        por_r2r (mm6, mm7);
        pxor_r2r (mm2, mm5);

        pand_r2r (mm5, mm7);
        pavg_r2r (mm2, mm0);

        pand_m2r (mask_one, mm7);

        psubusb_r2r (mm7, mm0);

        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;

        movq_r2r (mm6, mm7);    /* unroll ! */
        movq_r2r (mm2, mm0);    /* unroll ! */
    } while (--height);
}

static inline void MC_put4_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride, const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_r2m (mm0, *dest);

        movq_m2r (*(ref+8), mm0);
        movq_m2r (*(ref+stride+9), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+9), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride+8), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        ref += stride;
        movq_r2m (mm0, *(dest+8));
        dest += stride;
    } while (--height);
}

static inline void MC_avg4_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*dest, mm1);
        pavg_r2r (mm1, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}

static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride, const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*dest, mm1);
        pavg_r2r (mm1, mm0);
        movq_r2m (mm0, *dest);

        movq_m2r (*(ref+8), mm0);
        movq_m2r (*(ref+stride+9), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+9), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride+8), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*(dest+8), mm1);
        pavg_r2r (mm1, mm0);
        ref += stride;
        movq_r2m (mm0, *(dest+8));
        dest += stride;
    } while (--height);
}

#if HAVE_MMX2

static void MC_avg_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_avg1_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_avg_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_avg1_8 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}

static void MC_put_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}

static void MC_avg_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_avg_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_put_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_put_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
}

static void MC_avg_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_avg_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_put_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_put_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
}

static void MC_avg_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                 int stride, int height)
{
    MC_avg4_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_avg_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_avg4_8 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
                                 int stride, int height)
{
    MC_put4_16 (height, dest, ref, stride, CPU_MMXEXT);
}

static void MC_put_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_put4_8 (height, dest, ref, stride, CPU_MMXEXT);
}


MPEG2_MC_EXTERN (mmxext)

#endif /* HAVE_MMX2 */

#if HAVE_AMD3DNOW

static void MC_avg_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_avg1_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_avg_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_avg1_8 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put1_16 (height, dest, ref, stride);
}

static void MC_put_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_put1_8 (height, dest, ref, stride);
}

static void MC_avg_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_avg_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_put_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_put_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
}

static void MC_avg_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_avg2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_avg_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_avg2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_put_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_put_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
                              int stride, int height)
{
    MC_put2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
}

static void MC_avg_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_avg4_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_avg_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_avg4_8 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
                                int stride, int height)
{
    MC_put4_16 (height, dest, ref, stride, CPU_3DNOW);
}

static void MC_put_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
                               int stride, int height)
{
    MC_put4_8 (height, dest, ref, stride, CPU_3DNOW);
}


MPEG2_MC_EXTERN (3dnow)

#endif /* HAVE_AMD3DNOW */

#endif /* ARCH_X86 || ARCH_X86_64 */