comparison libmpeg2/motion_comp_mmx.c @ 36:846535ace7a2

libmpeg2-0.2.0 merge
author arpi_esp
date Sun, 04 Mar 2001 21:01:54 +0000
parents 3b5f5d1c5041
children 47984e3f54ce
comparison
equal deleted inserted replaced
35:25f148e9890a 36:846535ace7a2
1 /* 1 /*
2 * motion_comp_mmx.c 2 * motion_comp_mmx.c
3 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> 3 * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
4 * 4 *
5 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. 5 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
6 * 6 *
7 * mpeg2dec is free software; you can redistribute it and/or modify 7 * mpeg2dec is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by 8 * it under the terms of the GNU General Public License as published by
31 31
32 #define CPU_MMXEXT 0 32 #define CPU_MMXEXT 0
33 #define CPU_3DNOW 1 33 #define CPU_3DNOW 1
34 34
35 35
36 //MMX code - needs a rewrite 36 /* MMX code - needs a rewrite */
37 37
38 38
39 39
40 40
41 41
42 42
43 43
44 // some rounding constants 44 /* some rounding constants */
45 mmx_t round1 = {0x0001000100010001LL}; 45 mmx_t round1 = {0x0001000100010001LL};
46 mmx_t round4 = {0x0002000200020002LL}; 46 mmx_t round4 = {0x0002000200020002LL};
47 47
48 /* 48 /*
49 * This code should probably be compiled with loop unrolling 49 * This code should probably be compiled with loop unrolling
53 * unrolling will help 53 * unrolling will help
54 */ 54 */
55 55
56 static inline void mmx_zero_reg () 56 static inline void mmx_zero_reg ()
57 { 57 {
58 // load 0 into mm0 58 /* load 0 into mm0 */
59 pxor_r2r (mm0, mm0); 59 pxor_r2r (mm0, mm0);
60 } 60 }
61 61
62 static inline void mmx_average_2_U8 (uint8_t * dest, 62 static inline void mmx_average_2_U8 (uint8_t * dest,
63 uint8_t * src1, uint8_t * src2) 63 uint8_t * src1, uint8_t * src2)
64 { 64 {
65 // 65 /* *dest = (*src1 + *src2 + 1)/ 2; */
66 // *dest = (*src1 + *src2 + 1)/ 2;
67 //
68 66
69 movq_m2r (*src1, mm1); // load 8 src1 bytes 67 movq_m2r (*src1, mm1); // load 8 src1 bytes
70 movq_r2r (mm1, mm2); // copy 8 src1 bytes 68 movq_r2r (mm1, mm2); // copy 8 src1 bytes
71 69
72 movq_m2r (*src2, mm3); // load 8 src2 bytes 70 movq_m2r (*src2, mm3); // load 8 src2 bytes
91 } 89 }
92 90
93 static inline void mmx_interp_average_2_U8 (uint8_t * dest, 91 static inline void mmx_interp_average_2_U8 (uint8_t * dest,
94 uint8_t * src1, uint8_t * src2) 92 uint8_t * src1, uint8_t * src2)
95 { 93 {
96 // 94 /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */
97 // *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
98 //
99 95
100 movq_m2r (*dest, mm1); // load 8 dest bytes 96 movq_m2r (*dest, mm1); // load 8 dest bytes
101 movq_r2r (mm1, mm2); // copy 8 dest bytes 97 movq_r2r (mm1, mm2); // copy 8 dest bytes
102 98
103 movq_m2r (*src1, mm3); // load 8 src1 bytes 99 movq_m2r (*src1, mm3); // load 8 src1 bytes
137 133
138 static inline void mmx_average_4_U8 (uint8_t * dest, 134 static inline void mmx_average_4_U8 (uint8_t * dest,
139 uint8_t * src1, uint8_t * src2, 135 uint8_t * src1, uint8_t * src2,
140 uint8_t * src3, uint8_t * src4) 136 uint8_t * src3, uint8_t * src4)
141 { 137 {
142 // 138 /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */
143 // *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4;
144 //
145 139
146 movq_m2r (*src1, mm1); // load 8 src1 bytes 140 movq_m2r (*src1, mm1); // load 8 src1 bytes
147 movq_r2r (mm1, mm2); // copy 8 src1 bytes 141 movq_r2r (mm1, mm2); // copy 8 src1 bytes
148 142
149 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes 143 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
156 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes 150 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
157 151
158 paddw_r2r (mm3, mm1); // add lows 152 paddw_r2r (mm3, mm1); // add lows
159 paddw_r2r (mm4, mm2); // add highs 153 paddw_r2r (mm4, mm2); // add highs
160 154
161 // now have partials in mm1 and mm2 155 /* now have partials in mm1 and mm2 */
162 156
163 movq_m2r (*src3, mm3); // load 8 src3 bytes 157 movq_m2r (*src3, mm3); // load 8 src3 bytes
164 movq_r2r (mm3, mm4); // copy 8 src3 bytes 158 movq_r2r (mm3, mm4); // copy 8 src3 bytes
165 159
166 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes 160 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
176 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes 170 punpckhbw_r2r (mm0, mm6); // unpack high src4 bytes
177 171
178 paddw_r2r (mm5, mm1); // add lows 172 paddw_r2r (mm5, mm1); // add lows
179 paddw_r2r (mm6, mm2); // add highs 173 paddw_r2r (mm6, mm2); // add highs
180 174
181 // now have subtotal in mm1 and mm2 175 /* now have subtotal in mm1 and mm2 */
182 176
183 paddw_m2r (round4, mm1); 177 paddw_m2r (round4, mm1);
184 psraw_i2r (2, mm1); // /4 178 psraw_i2r (2, mm1); // /4
185 paddw_m2r (round4, mm2); 179 paddw_m2r (round4, mm2);
186 psraw_i2r (2, mm2); // /4 180 psraw_i2r (2, mm2); // /4
191 185
192 static inline void mmx_interp_average_4_U8 (uint8_t * dest, 186 static inline void mmx_interp_average_4_U8 (uint8_t * dest,
193 uint8_t * src1, uint8_t * src2, 187 uint8_t * src1, uint8_t * src2,
194 uint8_t * src3, uint8_t * src4) 188 uint8_t * src3, uint8_t * src4)
195 { 189 {
196 // 190 /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */
197 // *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2;
198 //
199 191
200 movq_m2r (*src1, mm1); // load 8 src1 bytes 192 movq_m2r (*src1, mm1); // load 8 src1 bytes
201 movq_r2r (mm1, mm2); // copy 8 src1 bytes 193 movq_r2r (mm1, mm2); // copy 8 src1 bytes
202 194
203 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes 195 punpcklbw_r2r (mm0, mm1); // unpack low src1 bytes
210 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes 202 punpckhbw_r2r (mm0, mm4); // unpack high src2 bytes
211 203
212 paddw_r2r (mm3, mm1); // add lows 204 paddw_r2r (mm3, mm1); // add lows
213 paddw_r2r (mm4, mm2); // add highs 205 paddw_r2r (mm4, mm2); // add highs
214 206
215 // now have partials in mm1 and mm2 207 /* now have partials in mm1 and mm2 */
216 208
217 movq_m2r (*src3, mm3); // load 8 src3 bytes 209 movq_m2r (*src3, mm3); // load 8 src3 bytes
218 movq_r2r (mm3, mm4); // copy 8 src3 bytes 210 movq_r2r (mm3, mm4); // copy 8 src3 bytes
219 211
220 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes 212 punpcklbw_r2r (mm0, mm3); // unpack low src3 bytes
235 paddw_m2r (round4, mm1); 227 paddw_m2r (round4, mm1);
236 psraw_i2r (2, mm1); // /4 228 psraw_i2r (2, mm1); // /4
237 paddw_m2r (round4, mm2); 229 paddw_m2r (round4, mm2);
238 psraw_i2r (2, mm2); // /4 230 psraw_i2r (2, mm2); // /4
239 231
240 // now have subtotal/4 in mm1 and mm2 232 /* now have subtotal/4 in mm1 and mm2 */
241 233
242 movq_m2r (*dest, mm3); // load 8 dest bytes 234 movq_m2r (*dest, mm3); // load 8 dest bytes
243 movq_r2r (mm3, mm4); // copy 8 dest bytes 235 movq_r2r (mm3, mm4); // copy 8 dest bytes
244 236
245 punpcklbw_r2r (mm0, mm3); // unpack low dest bytes 237 punpcklbw_r2r (mm0, mm3); // unpack low dest bytes
251 paddw_m2r (round1, mm1); 243 paddw_m2r (round1, mm1);
252 psraw_i2r (1, mm1); // /2 244 psraw_i2r (1, mm1); // /2
253 paddw_m2r (round1, mm2); 245 paddw_m2r (round1, mm2);
254 psraw_i2r (1, mm2); // /2 246 psraw_i2r (1, mm2); // /2
255 247
256 // now have end value in mm1 and mm2 248 /* now have end value in mm1 and mm2 */
257 249
258 packuswb_r2r (mm2, mm1); // pack (w/ saturation) 250 packuswb_r2r (mm2, mm1); // pack (w/ saturation)
259 movq_r2m (mm1,*dest); // store result in dest 251 movq_r2m (mm1,*dest); // store result in dest
260 } 252 }
261 253
262 //----------------------------------------------------------------------- 254 /*-----------------------------------------------------------------------*/
263 255
264 static inline void MC_avg_mmx (int width, int height, 256 static inline void MC_avg_mmx (int width, int height,
265 uint8_t * dest, uint8_t * ref, int stride) 257 uint8_t * dest, uint8_t * ref, int stride)
266 { 258 {
267 mmx_zero_reg (); 259 mmx_zero_reg ();
287 int stride, int height) 279 int stride, int height)
288 { 280 {
289 MC_avg_mmx (8, height, dest, ref, stride); 281 MC_avg_mmx (8, height, dest, ref, stride);
290 } 282 }
291 283
292 //----------------------------------------------------------------------- 284 /*-----------------------------------------------------------------------*/
293 285
294 static inline void MC_put_mmx (int width, int height, 286 static inline void MC_put_mmx (int width, int height,
295 uint8_t * dest, uint8_t * ref, int stride) 287 uint8_t * dest, uint8_t * ref, int stride)
296 { 288 {
297 mmx_zero_reg (); 289 mmx_zero_reg ();
321 int stride, int height) 313 int stride, int height)
322 { 314 {
323 MC_put_mmx (8, height, dest, ref, stride); 315 MC_put_mmx (8, height, dest, ref, stride);
324 } 316 }
325 317
326 //----------------------------------------------------------------------- 318 /*-----------------------------------------------------------------------*/
327 319
328 // Half pixel interpolation in the x direction 320 /* Half pixel interpolation in the x direction */
329 static inline void MC_avg_x_mmx (int width, int height, 321 static inline void MC_avg_x_mmx (int width, int height,
330 uint8_t * dest, uint8_t * ref, int stride) 322 uint8_t * dest, uint8_t * ref, int stride)
331 { 323 {
332 mmx_zero_reg (); 324 mmx_zero_reg ();
333 325
352 int stride, int height) 344 int stride, int height)
353 { 345 {
354 MC_avg_x_mmx (8, height, dest, ref, stride); 346 MC_avg_x_mmx (8, height, dest, ref, stride);
355 } 347 }
356 348
357 //----------------------------------------------------------------------- 349 /*-----------------------------------------------------------------------*/
358 350
359 static inline void MC_put_x_mmx (int width, int height, 351 static inline void MC_put_x_mmx (int width, int height,
360 uint8_t * dest, uint8_t * ref, int stride) 352 uint8_t * dest, uint8_t * ref, int stride)
361 { 353 {
362 mmx_zero_reg (); 354 mmx_zero_reg ();
382 int stride, int height) 374 int stride, int height)
383 { 375 {
384 MC_put_x_mmx (8, height, dest, ref, stride); 376 MC_put_x_mmx (8, height, dest, ref, stride);
385 } 377 }
386 378
387 //----------------------------------------------------------------------- 379 /*-----------------------------------------------------------------------*/
388 380
389 static inline void MC_avg_xy_mmx (int width, int height, 381 static inline void MC_avg_xy_mmx (int width, int height,
390 uint8_t * dest, uint8_t * ref, int stride) 382 uint8_t * dest, uint8_t * ref, int stride)
391 { 383 {
392 uint8_t * ref_next = ref+stride; 384 uint8_t * ref_next = ref+stride;
416 int stride, int height) 408 int stride, int height)
417 { 409 {
418 MC_avg_xy_mmx (8, height, dest, ref, stride); 410 MC_avg_xy_mmx (8, height, dest, ref, stride);
419 } 411 }
420 412
421 //----------------------------------------------------------------------- 413 /*-----------------------------------------------------------------------*/
422 414
423 static inline void MC_put_xy_mmx (int width, int height, 415 static inline void MC_put_xy_mmx (int width, int height,
424 uint8_t * dest, uint8_t * ref, int stride) 416 uint8_t * dest, uint8_t * ref, int stride)
425 { 417 {
426 uint8_t * ref_next = ref+stride; 418 uint8_t * ref_next = ref+stride;
449 int stride, int height) 441 int stride, int height)
450 { 442 {
451 MC_put_xy_mmx (8, height, dest, ref, stride); 443 MC_put_xy_mmx (8, height, dest, ref, stride);
452 } 444 }
453 445
454 //----------------------------------------------------------------------- 446 /*-----------------------------------------------------------------------*/
455 447
456 static inline void MC_avg_y_mmx (int width, int height, 448 static inline void MC_avg_y_mmx (int width, int height,
457 uint8_t * dest, uint8_t * ref, int stride) 449 uint8_t * dest, uint8_t * ref, int stride)
458 { 450 {
459 uint8_t * ref_next = ref+stride; 451 uint8_t * ref_next = ref+stride;
482 int stride, int height) 474 int stride, int height)
483 { 475 {
484 MC_avg_y_mmx (8, height, dest, ref, stride); 476 MC_avg_y_mmx (8, height, dest, ref, stride);
485 } 477 }
486 478
487 //----------------------------------------------------------------------- 479 /*-----------------------------------------------------------------------*/
488 480
489 static inline void MC_put_y_mmx (int width, int height, 481 static inline void MC_put_y_mmx (int width, int height,
490 uint8_t * dest, uint8_t * ref, int stride) 482 uint8_t * dest, uint8_t * ref, int stride)
491 { 483 {
492 uint8_t * ref_next = ref+stride; 484 uint8_t * ref_next = ref+stride;
524 516
525 517
526 518
527 519
528 520
529 //CPU_MMXEXT/CPU_3DNOW adaptation layer 521 /* CPU_MMXEXT/CPU_3DNOW adaptation layer */
530 522
531 #define pavg_r2r(src,dest) \ 523 #define pavg_r2r(src,dest) \
532 do { \ 524 do { \
533 if (cpu == CPU_MMXEXT) \ 525 if (cpu == CPU_MMXEXT) \
534 pavgb_r2r (src, dest); \ 526 pavgb_r2r (src, dest); \
543 else \ 535 else \
544 pavgusb_m2r (src, dest); \ 536 pavgusb_m2r (src, dest); \
545 } while (0) 537 } while (0)
546 538
547 539
548 //CPU_MMXEXT code 540 /* CPU_MMXEXT code */
549 541
550 542
551 static inline void MC_put1_8 (int height, uint8_t * dest, uint8_t * ref, 543 static inline void MC_put1_8 (int height, uint8_t * dest, uint8_t * ref,
552 int stride) 544 int stride)
553 { 545 {