comparison i386/dsputil_mmx_rnd.h @ 984:e162c09efbe7 libavcodec

qpel fix
author michaelni
date Thu, 09 Jan 2003 20:42:37 +0000
parents 274b518c4ecb
children 42fdf7b24d2e
comparison
equal deleted inserted replaced
983:ca2a303ea039 984:e162c09efbe7
56 56
57 static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 57 static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
58 { 58 {
59 MOVQ_BFE(mm6); 59 MOVQ_BFE(mm6);
60 __asm __volatile( 60 __asm __volatile(
61 "test $1, %0 \n\t"
62 " jz 1f \n\t"
63 "movq (%1), %%mm0 \n\t"
64 "movq (%2), %%mm1 \n\t"
65 "addl %4, %1 \n\t"
66 "addl $8, %2 \n\t"
67 PAVGB(%%mm0, %%mm1, %%mm4, %%mm6)
68 "movq %%mm4, (%3) \n\t"
69 "addl %5, %3 \n\t"
70 "decl %0 \n\t"
61 ".balign 8 \n\t" 71 ".balign 8 \n\t"
62 "1: \n\t" 72 "1: \n\t"
63 "movq (%1), %%mm0 \n\t" 73 "movq (%1), %%mm0 \n\t"
64 "movq (%2), %%mm1 \n\t" 74 "movq (%2), %%mm1 \n\t"
65 "addl %4, %1 \n\t" 75 "addl %4, %1 \n\t"
142 152
143 static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) 153 static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
144 { 154 {
145 MOVQ_BFE(mm6); 155 MOVQ_BFE(mm6);
146 __asm __volatile( 156 __asm __volatile(
157 "test $1, %0 \n\t"
158 " jz 1f \n\t"
159 "movq (%1), %%mm0 \n\t"
160 "movq (%2), %%mm1 \n\t"
161 "movq 8(%1), %%mm2 \n\t"
162 "movq 8(%2), %%mm3 \n\t"
163 "addl %4, %1 \n\t"
164 "addl $16, %2 \n\t"
165 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
166 "movq %%mm4, (%3) \n\t"
167 "movq %%mm5, 8(%3) \n\t"
168 "addl %5, %3 \n\t"
169 "decl %0 \n\t"
147 ".balign 8 \n\t" 170 ".balign 8 \n\t"
148 "1: \n\t" 171 "1: \n\t"
149 "movq (%1), %%mm0 \n\t" 172 "movq (%1), %%mm0 \n\t"
150 "movq (%2), %%mm1 \n\t" 173 "movq (%2), %%mm1 \n\t"
151 "movq 8(%1), %%mm2 \n\t" 174 "movq 8(%1), %%mm2 \n\t"
267 "subl $2, %0 \n\t" 290 "subl $2, %0 \n\t"
268 "jnz 1b \n\t" 291 "jnz 1b \n\t"
269 :"+g"(h), "+S"(pixels) 292 :"+g"(h), "+S"(pixels)
270 :"D"(block), "r"(line_size) 293 :"D"(block), "r"(line_size)
271 :"eax", "memory"); 294 :"eax", "memory");
272 }
273
/*
 * put pixels8_l4: writes h rows of 8 bytes to dst.
 * Per output byte i of a row the asm computes
 *     (src1[i] + src2[i] + src2[64 + i] + src2[136 + i] + rnd) >> 2
 * in 16-bit lanes and packs the result back to bytes.
 * This assumes MOVQ_ZERO leaves mm7 all-zero and SET_RND puts the rounding
 * constant in every 16-bit lane of mm6 - both macros are defined outside
 * this view, TODO confirm.
 *
 * After each row dst and src1 advance by stride, src2 advances by 8.
 * The loop body runs before the counter is tested, so h must be >= 1.
 * NOTE(review): mm0-mm4 are written by the asm but only memory is listed
 * as a clobber; addl is applied to the pointer operands (32-bit x86 form).
 */
274 static void DEF(put, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
275 {
276 MOVQ_ZERO(mm7);
277 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
278 __asm __volatile(
279 ".balign 8 \n\t"
280 "1: \n\t"
/* low 4 bytes of the row: load the four sources and widen bytes to words */
281 "movq (%1), %%mm0 \n\t"
282 "movq (%2), %%mm1 \n\t"
283 "movq 64(%2), %%mm2 \n\t"
284 "movq 136(%2), %%mm3 \n\t"
285 "punpcklbw %%mm7, %%mm0 \n\t"
286 "punpcklbw %%mm7, %%mm1 \n\t"
287 "punpcklbw %%mm7, %%mm2 \n\t"
288 "punpcklbw %%mm7, %%mm3 \n\t"
/* mm3 = (s1 + rnd + s2a + s2b + s2c) >> 2 for the low half */
289 "paddusw %%mm6, %%mm0 \n\t"
290 "paddusw %%mm0, %%mm1 \n\t"
291 "paddusw %%mm2, %%mm3 \n\t"
292 "paddusw %%mm1, %%mm3 \n\t"
293 "psrlw $2, %%mm3 \n\t"
/* high 4 bytes: same sources are reloaded and the upper bytes widened, result in mm4 */
294 "movq (%1), %%mm0 \n\t"
295 "movq (%2), %%mm1 \n\t"
296 "movq 64(%2), %%mm2 \n\t"
297 "movq 136(%2), %%mm4 \n\t"
298 "punpckhbw %%mm7, %%mm0 \n\t"
299 "punpckhbw %%mm7, %%mm1 \n\t"
300 "punpckhbw %%mm7, %%mm2 \n\t"
301 "punpckhbw %%mm7, %%mm4 \n\t"
302 "paddusw %%mm6, %%mm0 \n\t"
303 "paddusw %%mm0, %%mm1 \n\t"
304 "paddusw %%mm2, %%mm4 \n\t"
305 "paddusw %%mm1, %%mm4 \n\t"
306 "psrlw $2, %%mm4 \n\t"
/* pack both halves back to 8 bytes and store the row */
307 "packuswb %%mm4, %%mm3 \n\t"
308 "movq %%mm3, (%0) \n\t"
/* next row: dst and src1 step by stride, src2 steps by 8; loop until h reaches 0 */
309 "addl %4, %0 \n\t"
310 "addl %4, %1 \n\t"
311 "addl $8, %2 \n\t"
312 "decl %3 \n\t"
313 "jnz 1b \n\t"
314 :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
315 :"r"(stride)
316 :"memory");
317 }
318
/*
 * put pixels16_l4: writes h rows of 16 bytes to dst.
 * Per output byte i (0..15) of a row the asm computes
 *     (src1[i] + src2[i] + src2[256 + i] + src2[528 + i] + rnd) >> 2
 * in 16-bit lanes, handling the row as two 8-byte groups (offsets 0 and 8).
 * This assumes MOVQ_ZERO leaves mm7 all-zero and SET_RND puts the rounding
 * constant in every 16-bit lane of mm6 - both macros are defined outside
 * this view, TODO confirm.
 *
 * After each row dst and src1 advance by stride, src2 advances by 16.
 * The loop body runs before the counter is tested, so h must be >= 1.
 * NOTE(review): mm0-mm4 are written by the asm but only memory is listed
 * as a clobber; addl is applied to the pointer operands (32-bit x86 form).
 */
319 static void DEF(put, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
320 {
321 MOVQ_ZERO(mm7);
322 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
323 __asm __volatile(
324 ".balign 8 \n\t"
325 "1: \n\t"
/* bytes 0..7, low half: widen to words, sum with rounding, shift right by 2 -> mm3 */
326 "movq (%1), %%mm0 \n\t"
327 "movq (%2), %%mm1 \n\t"
328 "movq 256(%2), %%mm2 \n\t"
329 "movq 528(%2), %%mm3 \n\t"
330 "punpcklbw %%mm7, %%mm0 \n\t"
331 "punpcklbw %%mm7, %%mm1 \n\t"
332 "punpcklbw %%mm7, %%mm2 \n\t"
333 "punpcklbw %%mm7, %%mm3 \n\t"
334 "paddusw %%mm6, %%mm0 \n\t"
335 "paddusw %%mm0, %%mm1 \n\t"
336 "paddusw %%mm2, %%mm3 \n\t"
337 "paddusw %%mm1, %%mm3 \n\t"
338 "psrlw $2, %%mm3 \n\t"
/* bytes 0..7, high half: same computation on the upper bytes -> mm4 */
339 "movq (%1), %%mm0 \n\t"
340 "movq (%2), %%mm1 \n\t"
341 "movq 256(%2), %%mm2 \n\t"
342 "movq 528(%2), %%mm4 \n\t"
343 "punpckhbw %%mm7, %%mm0 \n\t"
344 "punpckhbw %%mm7, %%mm1 \n\t"
345 "punpckhbw %%mm7, %%mm2 \n\t"
346 "punpckhbw %%mm7, %%mm4 \n\t"
347 "paddusw %%mm6, %%mm0 \n\t"
348 "paddusw %%mm0, %%mm1 \n\t"
349 "paddusw %%mm2, %%mm4 \n\t"
350 "paddusw %%mm1, %%mm4 \n\t"
351 "psrlw $2, %%mm4 \n\t"
/* pack and store bytes 0..7 of the row */
352 "packuswb %%mm4, %%mm3 \n\t"
353 "movq %%mm3, (%0) \n\t"
/* bytes 8..15, low half: every source offset is shifted by 8 */
354 "movq 8(%1), %%mm0 \n\t"
355 "movq 8(%2), %%mm1 \n\t"
356 "movq 264(%2), %%mm2 \n\t"
357 "movq 536(%2), %%mm3 \n\t"
358 "punpcklbw %%mm7, %%mm0 \n\t"
359 "punpcklbw %%mm7, %%mm1 \n\t"
360 "punpcklbw %%mm7, %%mm2 \n\t"
361 "punpcklbw %%mm7, %%mm3 \n\t"
362 "paddusw %%mm6, %%mm0 \n\t"
363 "paddusw %%mm0, %%mm1 \n\t"
364 "paddusw %%mm2, %%mm3 \n\t"
365 "paddusw %%mm1, %%mm3 \n\t"
366 "psrlw $2, %%mm3 \n\t"
/* bytes 8..15, high half */
367 "movq 8(%1), %%mm0 \n\t"
368 "movq 8(%2), %%mm1 \n\t"
369 "movq 264(%2), %%mm2 \n\t"
370 "movq 536(%2), %%mm4 \n\t"
371 "punpckhbw %%mm7, %%mm0 \n\t"
372 "punpckhbw %%mm7, %%mm1 \n\t"
373 "punpckhbw %%mm7, %%mm2 \n\t"
374 "punpckhbw %%mm7, %%mm4 \n\t"
375 "paddusw %%mm6, %%mm0 \n\t"
376 "paddusw %%mm0, %%mm1 \n\t"
377 "paddusw %%mm2, %%mm4 \n\t"
378 "paddusw %%mm1, %%mm4 \n\t"
379 "psrlw $2, %%mm4 \n\t"
/* pack and store bytes 8..15 of the row */
380 "packuswb %%mm4, %%mm3 \n\t"
381 "movq %%mm3, 8(%0) \n\t"
/* next row: dst and src1 step by stride, src2 steps by 16; loop until h reaches 0 */
382 "addl %4, %0 \n\t"
383 "addl %4, %1 \n\t"
384 "addl $16, %2 \n\t"
385 "decl %3 \n\t"
386 "jnz 1b \n\t"
387 :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
388 :"r"(stride)
389 :"memory");
390 } 295 }
391 296
392 // avg_pixels 297 // avg_pixels
393 // in case more speed is needed - unrolling would certainly help 298 // in case more speed is needed - unrolling would certainly help
394 static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h) 299 static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
639 :"+g"(h), "+S"(pixels) 544 :"+g"(h), "+S"(pixels)
640 :"D"(block), "r"(line_size) 545 :"D"(block), "r"(line_size)
641 :"eax", "memory"); 546 :"eax", "memory");
642 } 547 }
643 548
/*
 * avg pixels8_l4: for each of h rows, computes per byte i
 *     (src1[i] + src2[i] + src2[64 + i] + src2[136 + i] + rnd) >> 2
 * in 16-bit lanes, packs it to 8 bytes, then combines that with the 8 bytes
 * already at dst through the PAVGB macro and stores the macro result to dst.
 * This assumes MOVQ_ZERO leaves mm7 all-zero and SET_RND puts the rounding
 * constant in every 16-bit lane of mm6; PAVGB presumably produces a byte-wise
 * average using the constant MOVQ_BFE loads into mm5 - all four macros are
 * defined outside this view, TODO confirm.
 *
 * After each row dst and src1 advance by stride, src2 advances by 8.
 * The loop body runs before the counter is tested, so h must be >= 1.
 * NOTE(review): the MMX registers written by the asm are not listed as
 * clobbers; addl is applied to the pointer operands (32-bit x86 form).
 */
644 static void DEF(avg, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
645 {
646 MOVQ_ZERO(mm7);
647 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
648 MOVQ_BFE(mm5);
649 __asm __volatile(
650 ".balign 8 \n\t"
651 "1: \n\t"
/* low 4 bytes: widen the four sources to words, sum with rounding, shift right by 2 -> mm3 */
652 "movq (%1), %%mm0 \n\t"
653 "movq (%2), %%mm1 \n\t"
654 "movq 64(%2), %%mm2 \n\t"
655 "movq 136(%2), %%mm3 \n\t"
656 "punpcklbw %%mm7, %%mm0 \n\t"
657 "punpcklbw %%mm7, %%mm1 \n\t"
658 "punpcklbw %%mm7, %%mm2 \n\t"
659 "punpcklbw %%mm7, %%mm3 \n\t"
660 "paddusw %%mm6, %%mm0 \n\t"
661 "paddusw %%mm0, %%mm1 \n\t"
662 "paddusw %%mm2, %%mm3 \n\t"
663 "paddusw %%mm1, %%mm3 \n\t"
664 "psrlw $2, %%mm3 \n\t"
/* high 4 bytes: same computation on the upper bytes -> mm4 */
665 "movq (%1), %%mm0 \n\t"
666 "movq (%2), %%mm1 \n\t"
667 "movq 64(%2), %%mm2 \n\t"
668 "movq 136(%2), %%mm4 \n\t"
669 "punpckhbw %%mm7, %%mm0 \n\t"
670 "punpckhbw %%mm7, %%mm1 \n\t"
671 "punpckhbw %%mm7, %%mm2 \n\t"
672 "punpckhbw %%mm7, %%mm4 \n\t"
673 "paddusw %%mm6, %%mm0 \n\t"
674 "paddusw %%mm0, %%mm1 \n\t"
675 "paddusw %%mm2, %%mm4 \n\t"
676 "paddusw %%mm1, %%mm4 \n\t"
677 "psrlw $2, %%mm4 \n\t"
/* pack to bytes, read the current dst row, combine via PAVGB into mm0 and store it */
678 "packuswb %%mm4, %%mm3 \n\t"
679 "movq (%0), %%mm4 \n\t"
680 PAVGB(%%mm3, %%mm4, %%mm0, %%mm5)
681 "movq %%mm0, (%0) \n\t"
/* next row: dst and src1 step by stride, src2 steps by 8; loop until h reaches 0 */
682 "addl %4, %0 \n\t"
683 "addl %4, %1 \n\t"
684 "addl $8, %2 \n\t"
685 "decl %3 \n\t"
686 "jnz 1b \n\t"
687 :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
688 :"r"(stride)
689 :"memory");
690 }
691
/*
 * avg pixels16_l4: for each of h rows, computes per byte i (0..15)
 *     (src1[i] + src2[i] + src2[256 + i] + src2[528 + i] + rnd) >> 2
 * in 16-bit lanes, as two 8-byte groups (offsets 0 and 8). Each packed group
 * is combined with the 8 bytes already at the matching dst position through
 * the PAVGB macro, and the macro result is stored back to dst.
 * This assumes MOVQ_ZERO leaves mm7 all-zero and SET_RND puts the rounding
 * constant in every 16-bit lane of mm6; PAVGB presumably produces a byte-wise
 * average using the constant MOVQ_BFE loads into mm5 - all four macros are
 * defined outside this view, TODO confirm.
 *
 * After each row dst and src1 advance by stride, src2 advances by 16.
 * The loop body runs before the counter is tested, so h must be >= 1.
 * NOTE(review): the MMX registers written by the asm are not listed as
 * clobbers; addl is applied to the pointer operands (32-bit x86 form).
 */
692 static void DEF(avg, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h)
693 {
694 MOVQ_ZERO(mm7);
695 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
696 MOVQ_BFE(mm5);
697 __asm __volatile(
698 ".balign 8 \n\t"
699 "1: \n\t"
/* bytes 0..7, low half: widen to words, sum with rounding, shift right by 2 -> mm3 */
700 "movq (%1), %%mm0 \n\t"
701 "movq (%2), %%mm1 \n\t"
702 "movq 256(%2), %%mm2 \n\t"
703 "movq 528(%2), %%mm3 \n\t"
704 "punpcklbw %%mm7, %%mm0 \n\t"
705 "punpcklbw %%mm7, %%mm1 \n\t"
706 "punpcklbw %%mm7, %%mm2 \n\t"
707 "punpcklbw %%mm7, %%mm3 \n\t"
708 "paddusw %%mm6, %%mm0 \n\t"
709 "paddusw %%mm0, %%mm1 \n\t"
710 "paddusw %%mm2, %%mm3 \n\t"
711 "paddusw %%mm1, %%mm3 \n\t"
712 "psrlw $2, %%mm3 \n\t"
/* bytes 0..7, high half: same computation on the upper bytes -> mm4 */
713 "movq (%1), %%mm0 \n\t"
714 "movq (%2), %%mm1 \n\t"
715 "movq 256(%2), %%mm2 \n\t"
716 "movq 528(%2), %%mm4 \n\t"
717 "punpckhbw %%mm7, %%mm0 \n\t"
718 "punpckhbw %%mm7, %%mm1 \n\t"
719 "punpckhbw %%mm7, %%mm2 \n\t"
720 "punpckhbw %%mm7, %%mm4 \n\t"
721 "paddusw %%mm6, %%mm0 \n\t"
722 "paddusw %%mm0, %%mm1 \n\t"
723 "paddusw %%mm2, %%mm4 \n\t"
724 "paddusw %%mm1, %%mm4 \n\t"
725 "psrlw $2, %%mm4 \n\t"
/* pack, read dst bytes 0..7, combine via PAVGB into mm0 and store it */
726 "packuswb %%mm4, %%mm3 \n\t"
727 "movq (%0), %%mm4 \n\t"
728 PAVGB(%%mm3, %%mm4, %%mm0, %%mm5)
729 "movq %%mm0, (%0) \n\t"
/* bytes 8..15, low half: every source offset is shifted by 8 */
730 "movq 8(%1), %%mm0 \n\t"
731 "movq 8(%2), %%mm1 \n\t"
732 "movq 264(%2), %%mm2 \n\t"
733 "movq 536(%2), %%mm3 \n\t"
734 "punpcklbw %%mm7, %%mm0 \n\t"
735 "punpcklbw %%mm7, %%mm1 \n\t"
736 "punpcklbw %%mm7, %%mm2 \n\t"
737 "punpcklbw %%mm7, %%mm3 \n\t"
738 "paddusw %%mm6, %%mm0 \n\t"
739 "paddusw %%mm0, %%mm1 \n\t"
740 "paddusw %%mm2, %%mm3 \n\t"
741 "paddusw %%mm1, %%mm3 \n\t"
742 "psrlw $2, %%mm3 \n\t"
/* bytes 8..15, high half */
743 "movq 8(%1), %%mm0 \n\t"
744 "movq 8(%2), %%mm1 \n\t"
745 "movq 264(%2), %%mm2 \n\t"
746 "movq 536(%2), %%mm4 \n\t"
747 "punpckhbw %%mm7, %%mm0 \n\t"
748 "punpckhbw %%mm7, %%mm1 \n\t"
749 "punpckhbw %%mm7, %%mm2 \n\t"
750 "punpckhbw %%mm7, %%mm4 \n\t"
751 "paddusw %%mm6, %%mm0 \n\t"
752 "paddusw %%mm0, %%mm1 \n\t"
753 "paddusw %%mm2, %%mm4 \n\t"
754 "paddusw %%mm1, %%mm4 \n\t"
755 "psrlw $2, %%mm4 \n\t"
/* pack, read dst bytes 8..15, combine via PAVGB into mm0 and store it */
756 "packuswb %%mm4, %%mm3 \n\t"
757 "movq 8(%0), %%mm4 \n\t"
758 PAVGB(%%mm3, %%mm4, %%mm0, %%mm5)
759 "movq %%mm0, 8(%0) \n\t"
/* next row: dst and src1 step by stride, src2 steps by 16; loop until h reaches 0 */
760 "addl %4, %0 \n\t"
761 "addl %4, %1 \n\t"
762 "addl $16, %2 \n\t"
763 "decl %3 \n\t"
764 "jnz 1b \n\t"
765 :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h)
766 :"r"(stride)
767 :"memory");
768 }
769
770
771 //FIXME optimize 549 //FIXME optimize
/*
 * put pixels16_y2: handles a 16-byte-wide block by calling the 8-wide
 * pixels8_y2 routine twice with the same line_size and h - once at offset 0
 * and once at offset 8 of both block and pixels.
 * This definition is identical in both revisions of the comparison.
 */
772 static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ 550 static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
773 DEF(put, pixels8_y2)(block , pixels , line_size, h); 551 DEF(put, pixels8_y2)(block , pixels , line_size, h);
774 DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); 552 DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
775 } 553 }