comparison libmpeg2/motion_comp_mmx.c @ 36:846535ace7a2
libmpeg2-0.2.0 merge
author:   arpi_esp
date:     Sun, 04 Mar 2001 21:01:54 +0000
parents:  3b5f5d1c5041
children: 47984e3f54ce
left column: 35:25f148e9890a (old) | right column: 36:846535ace7a2 (new)
1 /* | 1 /* |
2 * motion_comp_mmx.c | 2 * motion_comp_mmx.c |
3 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> | 3 * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> |
4 * | 4 * |
5 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. | 5 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. |
6 * | 6 * |
7 * mpeg2dec is free software; you can redistribute it and/or modify | 7 * mpeg2dec is free software; you can redistribute it and/or modify |
8 * it under the terms of the GNU General Public License as published by | 8 * it under the terms of the GNU General Public License as published by |
31 | 31 |
32 #define CPU_MMXEXT 0 | 32 #define CPU_MMXEXT 0 |
33 #define CPU_3DNOW 1 | 33 #define CPU_3DNOW 1 |
34 | 34 |
35 | 35 |
36 //MMX code - needs a rewrite | 36 /* MMX code - needs a rewrite */ |
37 | 37 |
38 | 38 |
39 | 39 |
40 | 40 |
41 | 41 |
42 | 42 |
43 | 43 |
44 // some rounding constants | 44 /* some rounding constants */ |
45 mmx_t round1 = {0x0001000100010001LL}; | 45 mmx_t round1 = {0x0001000100010001LL}; |
46 mmx_t round4 = {0x0002000200020002LL}; | 46 mmx_t round4 = {0x0002000200020002LL}; |
47 | 47 |
48 /* | 48 /* |
49 * This code should probably be compiled with loop unrolling | 49 * This code should probably be compiled with loop unrolling |
53 * unrolling will help | 53 * unrolling will help |
54 */ | 54 */ |
55 | 55 |
56 static inline void mmx_zero_reg () | 56 static inline void mmx_zero_reg () |
57 { | 57 { |
58 // load 0 into mm0 | 58 /* load 0 into mm0 */ |
59 pxor_r2r (mm0, mm0); | 59 pxor_r2r (mm0, mm0); |
60 } | 60 } |
61 | 61 |
62 static inline void mmx_average_2_U8 (uint8_t * dest, | 62 static inline void mmx_average_2_U8 (uint8_t * dest, |
63 uint8_t * src1, uint8_t * src2) | 63 uint8_t * src1, uint8_t * src2) |
64 { | 64 { |
65 // | 65 /* *dest = (*src1 + *src2 + 1)/ 2; */ |
66 // *dest = (*src1 + *src2 + 1)/ 2; | |
67 // | |
68 | 66 |
69 movq_m2r (*src1, mm1); // load 8 src1 bytes | 67 movq_m2r (*src1, mm1); // load 8 src1 bytes |
70 movq_r2r (mm1, mm2); // copy 8 src1 bytes | 68 movq_r2r (mm1, mm2); // copy 8 src1 bytes |
71 | 69 |
72 movq_m2r (*src2, mm3); // load 8 src2 bytes | 70 movq_m2r (*src2, mm3); // load 8 src2 bytes |
91 } | 89 } |
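
Note: per the header comment, mmx_average_2_U8 computes the rounded average of eight byte pairs; the MMX sequence unpacks bytes to 16-bit words against the zeroed mm0 (a byte pair can sum to 511, which overflows 8 bits), adds, biases with round1, shifts right by 1, and packs back with saturation. A minimal scalar sketch of the same operation (added for illustration, not part of the changeset):

    #include <stdint.h>

    /* Scalar equivalent of mmx_average_2_U8: rounded average of 8 byte pairs. */
    static void scalar_average_2_U8 (uint8_t * dest,
                                     const uint8_t * src1, const uint8_t * src2)
    {
        int i;
        for (i = 0; i < 8; i++)
            dest[i] = (uint8_t) ((src1[i] + src2[i] + 1) >> 1);
    }
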
92 | 90 |
93 static inline void mmx_interp_average_2_U8 (uint8_t * dest, | 91 static inline void mmx_interp_average_2_U8 (uint8_t * dest, |
94 uint8_t * src1, uint8_t * src2) | 92 uint8_t * src1, uint8_t * src2) |
95 { | 93 { |
96 // | 94 /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */ |
97 // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; | |
98 // | |
99 | 95 |
100 movq_m2r (*dest, mm1); // load 8 dest bytes | 96 movq_m2r (*dest, mm1); // load 8 dest bytes |
101 movq_r2r (mm1, mm2); // copy 8 dest bytes | 97 movq_r2r (mm1, mm2); // copy 8 dest bytes |
102 | 98 |
103 movq_m2r (*src1, mm3); // load 8 src1 bytes | 99 movq_m2r (*src1, mm3); // load 8 src1 bytes |
137 | 133 |
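
The body of mmx_interp_average_2_U8 is largely elided in this hunk, but its header comment gives the formula: average dest with the rounded average of src1 and src2, rounding at both stages. A scalar sketch of that formula (illustrative only, not from the changeset):

    #include <stdint.h>

    /* Scalar equivalent of mmx_interp_average_2_U8. */
    static void scalar_interp_average_2_U8 (uint8_t * dest,
                                            const uint8_t * src1,
                                            const uint8_t * src2)
    {
        int i;
        for (i = 0; i < 8; i++) {
            int avg2 = (src1[i] + src2[i] + 1) >> 1;            /* inner average */
            dest[i] = (uint8_t) ((dest[i] + avg2 + 1) >> 1);    /* outer average */
        }
    }
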
138 static inline void mmx_average_4_U8 (uint8_t * dest, | 134 static inline void mmx_average_4_U8 (uint8_t * dest, |
139 uint8_t * src1, uint8_t * src2, | 135 uint8_t * src1, uint8_t * src2, |
140 uint8_t * src3, uint8_t * src4) | 136 uint8_t * src3, uint8_t * src4) |
141 { | 137 { |
142 // | 138 /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */ |
143 // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; | |
144 // | |
145 | 139 |
146 movq_m2r (*src1, mm1); // load 8 src1 bytes | 140 movq_m2r (*src1, mm1); // load 8 src1 bytes |
147 movq_r2r (mm1, mm2); // copy 8 src1 bytes | 141 movq_r2r (mm1, mm2); // copy 8 src1 bytes |
148 | 142 |
149 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes | 143 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes |
156 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes | 150 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes |
157 | 151 |
158 paddw_r2r (mm3, mm1); // add lows | 152 paddw_r2r (mm3, mm1); // add lows |
159 paddw_r2r (mm4, mm2); // add highs | 153 paddw_r2r (mm4, mm2); // add highs |
160 | 154 |
161 // now have partials in mm1 and mm2 | 155 /* now have partials in mm1 and mm2 */ |
162 | 156 |
163 movq_m2r (*src3, mm3); // load 8 src3 bytes | 157 movq_m2r (*src3, mm3); // load 8 src3 bytes |
164 movq_r2r (mm3, mm4); // copy 8 src3 bytes | 158 movq_r2r (mm3, mm4); // copy 8 src3 bytes |
165 | 159 |
166 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes | 160 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes |
176 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes | 170 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes |
177 | 171 |
178 paddw_r2r (mm5, mm1); // add lows | 172 paddw_r2r (mm5, mm1); // add lows |
179 paddw_r2r (mm6, mm2); // add highs | 173 paddw_r2r (mm6, mm2); // add highs |
180 | 174 |
181 // now have subtotal in mm1 and mm2 | 175 /* now have subtotal in mm1 and mm2 */ |
182 | 176 |
183 paddw_m2r (round4, mm1); | 177 paddw_m2r (round4, mm1); |
184 psraw_i2r (2, mm1); // /4 | 178 psraw_i2r (2, mm1); // /4 |
185 paddw_m2r (round4, mm2); | 179 paddw_m2r (round4, mm2); |
186 psraw_i2r (2, mm2); // /4 | 180 psraw_i2r (2, mm2); // /4 |
191 | 185 |
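
mmx_average_4_U8 keeps its running sums as 16-bit words in mm1/mm2 because four bytes plus the bias can reach 1022, well past 8 bits; round4 = 2 gives round-to-nearest before the shift right by 2. A scalar sketch of the whole function (illustrative only, not part of the changeset):

    #include <stdint.h>

    /* Scalar equivalent of mmx_average_4_U8: rounded 4-way byte average. */
    static void scalar_average_4_U8 (uint8_t * dest,
                                     const uint8_t * src1, const uint8_t * src2,
                                     const uint8_t * src3, const uint8_t * src4)
    {
        int i;
        for (i = 0; i < 8; i++)
            dest[i] = (uint8_t) ((src1[i] + src2[i] + src3[i] + src4[i] + 2) >> 2);
    }
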
192 static inline void mmx_interp_average_4_U8 (uint8_t * dest, | 186 static inline void mmx_interp_average_4_U8 (uint8_t * dest, |
193 uint8_t * src1, uint8_t * src2, | 187 uint8_t * src1, uint8_t * src2, |
194 uint8_t * src3, uint8_t * src4) | 188 uint8_t * src3, uint8_t * src4) |
195 { | 189 { |
196 // | 190 /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */ |
197 // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; | |
198 // | |
199 | 191 |
200 movq_m2r (*src1, mm1); // load 8 src1 bytes | 192 movq_m2r (*src1, mm1); // load 8 src1 bytes |
201 movq_r2r (mm1, mm2); // copy 8 src1 bytes | 193 movq_r2r (mm1, mm2); // copy 8 src1 bytes |
202 | 194 |
203 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes | 195 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes |
210 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes | 202 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes |
211 | 203 |
212 paddw_r2r (mm3, mm1); // add lows | 204 paddw_r2r (mm3, mm1); // add lows |
213 paddw_r2r (mm4, mm2); // add highs | 205 paddw_r2r (mm4, mm2); // add highs |
214 | 206 |
215 // now have partials in mm1 and mm2 | 207 /* now have partials in mm1 and mm2 */ |
216 | 208 |
217 movq_m2r (*src3, mm3); // load 8 src3 bytes | 209 movq_m2r (*src3, mm3); // load 8 src3 bytes |
218 movq_r2r (mm3, mm4); // copy 8 src3 bytes | 210 movq_r2r (mm3, mm4); // copy 8 src3 bytes |
219 | 211 |
220 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes | 212 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes |
235 paddw_m2r (round4, mm1); | 227 paddw_m2r (round4, mm1); |
236 psraw_i2r (2, mm1); // /4 | 228 psraw_i2r (2, mm1); // /4 |
237 paddw_m2r (round4, mm2); | 229 paddw_m2r (round4, mm2); |
238 psraw_i2r (2, mm2); // /4 | 230 psraw_i2r (2, mm2); // /4 |
239 | 231 |
240 // now have subtotal/4 in mm1 and mm2 | 232 /* now have subtotal/4 in mm1 and mm2 */ |
241 | 233 |
242 movq_m2r (*dest, mm3); // load 8 dest bytes | 234 movq_m2r (*dest, mm3); // load 8 dest bytes |
243 movq_r2r (mm3, mm4); // copy 8 dest bytes | 235 movq_r2r (mm3, mm4); // copy 8 dest bytes |
244 | 236 |
245 punpcklbw_r2r (mm0, mm3); // unpack low dest bytes | 237 punpcklbw_r2r (mm0, mm3); // unpack low dest bytes |
251 paddw_m2r (round1, mm1); | 243 paddw_m2r (round1, mm1); |
252 psraw_i2r (1, mm1); // /2 | 244 psraw_i2r (1, mm1); // /2 |
253 paddw_m2r (round1, mm2); | 245 paddw_m2r (round1, mm2); |
254 psraw_i2r (1, mm2); // /2 | 246 psraw_i2r (1, mm2); // /2 |
255 | 247 |
256 // now have end value in mm1 and mm2 | 248 /* now have end value in mm1 and mm2 */ |
257 | 249 |
258 packuswb_r2r (mm2, mm1); // pack (w/ saturation) | 250 packuswb_r2r (mm2, mm1); // pack (w/ saturation) |
259 movq_r2m (mm1,*dest); // store result in dest | 251 movq_r2m (mm1,*dest); // store result in dest |
260 } | 252 } |
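
mmx_interp_average_4_U8 rounds in two stages, exactly as the visible paddw/psraw pairs show: first the 4-way average with round4 and a shift by 2, then an average into dest with round1 and a shift by 1. A scalar sketch (illustrative only, not part of the changeset):

    #include <stdint.h>

    /* Scalar equivalent of mmx_interp_average_4_U8. */
    static void scalar_interp_average_4_U8 (uint8_t * dest,
                                            const uint8_t * src1, const uint8_t * src2,
                                            const uint8_t * src3, const uint8_t * src4)
    {
        int i;
        for (i = 0; i < 8; i++) {
            int avg4 = (src1[i] + src2[i] + src3[i] + src4[i] + 2) >> 2;
            dest[i] = (uint8_t) ((dest[i] + avg4 + 1) >> 1);
        }
    }
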
261 | 253 |
262 //----------------------------------------------------------------------- | 254 /*-----------------------------------------------------------------------*/ |
263 | 255 |
264 static inline void MC_avg_mmx (int width, int height, | 256 static inline void MC_avg_mmx (int width, int height, |
265 uint8_t * dest, uint8_t * ref, int stride) | 257 uint8_t * dest, uint8_t * ref, int stride) |
266 { | 258 { |
267 mmx_zero_reg (); | 259 mmx_zero_reg (); |
287 int stride, int height) | 279 int stride, int height) |
288 { | 280 { |
289 MC_avg_mmx (8, height, dest, ref, stride); | 281 MC_avg_mmx (8, height, dest, ref, stride); |
290 } | 282 } |
291 | 283 |
292 //----------------------------------------------------------------------- | 284 /*-----------------------------------------------------------------------*/ |
293 | 285 |
294 static inline void MC_put_mmx (int width, int height, | 286 static inline void MC_put_mmx (int width, int height, |
295 uint8_t * dest, uint8_t * ref, int stride) | 287 uint8_t * dest, uint8_t * ref, int stride) |
296 { | 288 { |
297 mmx_zero_reg (); | 289 mmx_zero_reg (); |
321 int stride, int height) | 313 int stride, int height) |
322 { | 314 { |
323 MC_put_mmx (8, height, dest, ref, stride); | 315 MC_put_mmx (8, height, dest, ref, stride); |
324 } | 316 } |
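
On the naming convention visible here: the MC_put_* entry points write the prediction from ref into dest, the MC_avg_* counterparts average it into the existing dest, and the 8/16 suffix is the block width handed to the inner worker. The worker bodies are elided in this hunk, so the following scalar sketch of the put loop shape is an illustration of the semantics, not the committed code (the MMX version moves 8 bytes per movq instead of using memcpy):

    #include <stdint.h>
    #include <string.h>

    /* Scalar sketch of the MC_put loop structure: full-pel block copy. */
    static void scalar_MC_put (int width, int height,
                               uint8_t * dest, const uint8_t * ref, int stride)
    {
        int j;
        for (j = 0; j < height; j++) {
            memcpy (dest, ref, width);   /* full-pel prediction: plain row copy */
            dest += stride;
            ref += stride;
        }
    }
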
325 | 317 |
326 //----------------------------------------------------------------------- | 318 /*-----------------------------------------------------------------------*/ |
327 | 319 |
328 // Half pixel interpolation in the x direction | 320 /* Half pixel interpolation in the x direction */ |
329 static inline void MC_avg_x_mmx (int width, int height, | 321 static inline void MC_avg_x_mmx (int width, int height, |
330 uint8_t * dest, uint8_t * ref, int stride) | 322 uint8_t * dest, uint8_t * ref, int stride) |
331 { | 323 { |
332 mmx_zero_reg (); | 324 mmx_zero_reg (); |
333 | 325 |
352 int stride, int height) | 344 int stride, int height) |
353 { | 345 { |
354 MC_avg_x_mmx (8, height, dest, ref, stride); | 346 MC_avg_x_mmx (8, height, dest, ref, stride); |
355 } | 347 } |
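
Per the comment above MC_avg_x_mmx, "half pixel interpolation in the x direction" means each output sample is the rounded average of two horizontally adjacent reference bytes, with the avg variant then averaging that into dest, which is what the 2-input helpers above provide. The function body is elided in this hunk, so this scalar sketch is an inference from the comment and helper names, not the committed code:

    #include <stdint.h>

    /* Scalar sketch of MC_avg_x: x half-pel sample, averaged into dest. */
    static void scalar_MC_avg_x (int width, int height,
                                 uint8_t * dest, const uint8_t * ref, int stride)
    {
        int i, j;
        for (j = 0; j < height; j++) {
            for (i = 0; i < width; i++) {
                int half = (ref[i] + ref[i+1] + 1) >> 1;          /* x half-pel */
                dest[i] = (uint8_t) ((dest[i] + half + 1) >> 1);  /* avg into dest */
            }
            dest += stride;
            ref += stride;
        }
    }
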
356 | 348 |
357 //----------------------------------------------------------------------- | 349 /*-----------------------------------------------------------------------*/ |
358 | 350 |
359 static inline void MC_put_x_mmx (int width, int height, | 351 static inline void MC_put_x_mmx (int width, int height, |
360 uint8_t * dest, uint8_t * ref, int stride) | 352 uint8_t * dest, uint8_t * ref, int stride) |
361 { | 353 { |
362 mmx_zero_reg (); | 354 mmx_zero_reg (); |
382 int stride, int height) | 374 int stride, int height) |
383 { | 375 { |
384 MC_put_x_mmx (8, height, dest, ref, stride); | 376 MC_put_x_mmx (8, height, dest, ref, stride); |
385 } | 377 } |
386 | 378 |
387 //----------------------------------------------------------------------- | 379 /*-----------------------------------------------------------------------*/ |
388 | 380 |
389 static inline void MC_avg_xy_mmx (int width, int height, | 381 static inline void MC_avg_xy_mmx (int width, int height, |
390 uint8_t * dest, uint8_t * ref, int stride) | 382 uint8_t * dest, uint8_t * ref, int stride) |
391 { | 383 { |
392 uint8_t * ref_next = ref+stride; | 384 uint8_t * ref_next = ref+stride; |
416 int stride, int height) | 408 int stride, int height) |
417 { | 409 { |
418 MC_avg_xy_mmx (8, height, dest, ref, stride); | 410 MC_avg_xy_mmx (8, height, dest, ref, stride); |
419 } | 411 } |
420 | 412 |
421 //----------------------------------------------------------------------- | 413 /*-----------------------------------------------------------------------*/ |
422 | 414 |
423 static inline void MC_put_xy_mmx (int width, int height, | 415 static inline void MC_put_xy_mmx (int width, int height, |
424 uint8_t * dest, uint8_t * ref, int stride) | 416 uint8_t * dest, uint8_t * ref, int stride) |
425 { | 417 { |
426 uint8_t * ref_next = ref+stride; | 418 uint8_t * ref_next = ref+stride; |
449 int stride, int height) | 441 int stride, int height) |
450 { | 442 { |
451 MC_put_xy_mmx (8, height, dest, ref, stride); | 443 MC_put_xy_mmx (8, height, dest, ref, stride); |
452 } | 444 } |
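
The xy variants set up ref_next = ref + stride (visible at the top of both MC_*_xy_mmx bodies) and form each output from four neighbors, matching the 4-input averaging helpers above with sources ref, ref+1, ref_next, and ref_next+1. A scalar sketch of MC_put_xy under that reading (illustrative only; the loop body is elided in this hunk):

    #include <stdint.h>

    /* Scalar sketch of MC_put_xy: rounded 4-neighbor (x and y) half-pel average. */
    static void scalar_MC_put_xy (int width, int height,
                                  uint8_t * dest, const uint8_t * ref, int stride)
    {
        const uint8_t * ref_next = ref + stride;
        int i, j;
        for (j = 0; j < height; j++) {
            for (i = 0; i < width; i++)
                dest[i] = (uint8_t) ((ref[i] + ref[i+1] +
                                      ref_next[i] + ref_next[i+1] + 2) >> 2);
            dest += stride;
            ref += stride;
            ref_next += stride;
        }
    }
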
453 | 445 |
454 //----------------------------------------------------------------------- | 446 /*-----------------------------------------------------------------------*/ |
455 | 447 |
456 static inline void MC_avg_y_mmx (int width, int height, | 448 static inline void MC_avg_y_mmx (int width, int height, |
457 uint8_t * dest, uint8_t * ref, int stride) | 449 uint8_t * dest, uint8_t * ref, int stride) |
458 { | 450 { |
459 uint8_t * ref_next = ref+stride; | 451 uint8_t * ref_next = ref+stride; |
482 int stride, int height) | 474 int stride, int height) |
483 { | 475 { |
484 MC_avg_y_mmx (8, height, dest, ref, stride); | 476 MC_avg_y_mmx (8, height, dest, ref, stride); |
485 } | 477 } |
486 | 478 |
487 //----------------------------------------------------------------------- | 479 /*-----------------------------------------------------------------------*/ |
488 | 480 |
489 static inline void MC_put_y_mmx (int width, int height, | 481 static inline void MC_put_y_mmx (int width, int height, |
490 uint8_t * dest, uint8_t * ref, int stride) | 482 uint8_t * dest, uint8_t * ref, int stride) |
491 { | 483 { |
492 uint8_t * ref_next = ref+stride; | 484 uint8_t * ref_next = ref+stride; |
524 | 516 |
525 | 517 |
526 | 518 |
527 | 519 |
528 | 520 |
529 //CPU_MMXEXT/CPU_3DNOW adaptation layer | 521 /* CPU_MMXEXT/CPU_3DNOW adaptation layer */ |
530 | 522 |
531 #define pavg_r2r(src,dest) \ | 523 #define pavg_r2r(src,dest) \ |
532 do { \ | 524 do { \ |
533 if (cpu == CPU_MMXEXT) \ | 525 if (cpu == CPU_MMXEXT) \ |
534 pavgb_r2r (src, dest); \ | 526 pavgb_r2r (src, dest); \ |
543 else \ | 535 else \ |
544 pavgusb_m2r (src, dest); \ | 536 pavgusb_m2r (src, dest); \ |
545 } while (0) | 537 } while (0) |
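
The adaptation layer works because MMXEXT pavgb and 3DNow! pavgusb compute the same per-byte result, a rounded unsigned average, so dispatching on the cpu constant lets one function body serve both instruction sets; when inlined with a compile-time-constant cpu, the if folds away. A scalar sketch of the shared per-byte semantics (illustrative only):

    #include <stdint.h>

    /* Reference semantics shared by pavgb (CPU_MMXEXT) and pavgusb (CPU_3DNOW):
       per-byte rounded unsigned average, which is why one macro covers both. */
    static void scalar_pavg (uint8_t * dest, const uint8_t * src, int n)
    {
        int i;
        for (i = 0; i < n; i++)
            dest[i] = (uint8_t) ((dest[i] + src[i] + 1) >> 1);
    }
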
546 | 538 |
547 | 539 |
548 //CPU_MMXEXT code | 540 /* CPU_MMXEXT code */ |
549 | 541 |
550 | 542 |
551 static inline void MC_put1_8 (int height, uint8_t * dest, uint8_t * ref, | 543 static inline void MC_put1_8 (int height, uint8_t * dest, uint8_t * ref, |
552 int stride) | 544 int stride) |
553 { | 545 { |