Mercurial > libavcodec.hg
comparison i386/dsputil_mmx_rnd.h @ 984:e162c09efbe7 libavcodec
qpel fix
author | michaelni |
---|---|
date | Thu, 09 Jan 2003 20:42:37 +0000 |
parents | 274b518c4ecb |
children | 42fdf7b24d2e |
comparison
equal
deleted
inserted
replaced
983:ca2a303ea039 | 984:e162c09efbe7 |
---|---|
56 | 56 |
57 static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | 57 static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
58 { | 58 { |
59 MOVQ_BFE(mm6); | 59 MOVQ_BFE(mm6); |
60 __asm __volatile( | 60 __asm __volatile( |
61 "test $1, %0 \n\t" | |
62 " jz 1f \n\t" | |
63 "movq (%1), %%mm0 \n\t" | |
64 "movq (%2), %%mm1 \n\t" | |
65 "addl %4, %1 \n\t" | |
66 "addl $8, %2 \n\t" | |
67 PAVGB(%%mm0, %%mm1, %%mm4, %%mm6) | |
68 "movq %%mm4, (%3) \n\t" | |
69 "addl %5, %3 \n\t" | |
70 "decl %0 \n\t" | |
61 ".balign 8 \n\t" | 71 ".balign 8 \n\t" |
62 "1: \n\t" | 72 "1: \n\t" |
63 "movq (%1), %%mm0 \n\t" | 73 "movq (%1), %%mm0 \n\t" |
64 "movq (%2), %%mm1 \n\t" | 74 "movq (%2), %%mm1 \n\t" |
65 "addl %4, %1 \n\t" | 75 "addl %4, %1 \n\t" |
142 | 152 |
143 static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) | 153 static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
144 { | 154 { |
145 MOVQ_BFE(mm6); | 155 MOVQ_BFE(mm6); |
146 __asm __volatile( | 156 __asm __volatile( |
157 "test $1, %0 \n\t" | |
158 " jz 1f \n\t" | |
159 "movq (%1), %%mm0 \n\t" | |
160 "movq (%2), %%mm1 \n\t" | |
161 "movq 8(%1), %%mm2 \n\t" | |
162 "movq 8(%2), %%mm3 \n\t" | |
163 "addl %4, %1 \n\t" | |
164 "addl $16, %2 \n\t" | |
165 PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |
166 "movq %%mm4, (%3) \n\t" | |
167 "movq %%mm5, 8(%3) \n\t" | |
168 "addl %5, %3 \n\t" | |
169 "decl %0 \n\t" | |
147 ".balign 8 \n\t" | 170 ".balign 8 \n\t" |
148 "1: \n\t" | 171 "1: \n\t" |
149 "movq (%1), %%mm0 \n\t" | 172 "movq (%1), %%mm0 \n\t" |
150 "movq (%2), %%mm1 \n\t" | 173 "movq (%2), %%mm1 \n\t" |
151 "movq 8(%1), %%mm2 \n\t" | 174 "movq 8(%1), %%mm2 \n\t" |
267 "subl $2, %0 \n\t" | 290 "subl $2, %0 \n\t" |
268 "jnz 1b \n\t" | 291 "jnz 1b \n\t" |
269 :"+g"(h), "+S"(pixels) | 292 :"+g"(h), "+S"(pixels) |
270 :"D"(block), "r"(line_size) | 293 :"D"(block), "r"(line_size) |
271 :"eax", "memory"); | 294 :"eax", "memory"); |
272 } | |
273 | |
274 static void DEF(put, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h) | |
275 { | |
276 MOVQ_ZERO(mm7); | |
277 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | |
278 __asm __volatile( | |
279 ".balign 8 \n\t" | |
280 "1: \n\t" | |
281 "movq (%1), %%mm0 \n\t" | |
282 "movq (%2), %%mm1 \n\t" | |
283 "movq 64(%2), %%mm2 \n\t" | |
284 "movq 136(%2), %%mm3 \n\t" | |
285 "punpcklbw %%mm7, %%mm0 \n\t" | |
286 "punpcklbw %%mm7, %%mm1 \n\t" | |
287 "punpcklbw %%mm7, %%mm2 \n\t" | |
288 "punpcklbw %%mm7, %%mm3 \n\t" | |
289 "paddusw %%mm6, %%mm0 \n\t" | |
290 "paddusw %%mm0, %%mm1 \n\t" | |
291 "paddusw %%mm2, %%mm3 \n\t" | |
292 "paddusw %%mm1, %%mm3 \n\t" | |
293 "psrlw $2, %%mm3 \n\t" | |
294 "movq (%1), %%mm0 \n\t" | |
295 "movq (%2), %%mm1 \n\t" | |
296 "movq 64(%2), %%mm2 \n\t" | |
297 "movq 136(%2), %%mm4 \n\t" | |
298 "punpckhbw %%mm7, %%mm0 \n\t" | |
299 "punpckhbw %%mm7, %%mm1 \n\t" | |
300 "punpckhbw %%mm7, %%mm2 \n\t" | |
301 "punpckhbw %%mm7, %%mm4 \n\t" | |
302 "paddusw %%mm6, %%mm0 \n\t" | |
303 "paddusw %%mm0, %%mm1 \n\t" | |
304 "paddusw %%mm2, %%mm4 \n\t" | |
305 "paddusw %%mm1, %%mm4 \n\t" | |
306 "psrlw $2, %%mm4 \n\t" | |
307 "packuswb %%mm4, %%mm3 \n\t" | |
308 "movq %%mm3, (%0) \n\t" | |
309 "addl %4, %0 \n\t" | |
310 "addl %4, %1 \n\t" | |
311 "addl $8, %2 \n\t" | |
312 "decl %3 \n\t" | |
313 "jnz 1b \n\t" | |
314 :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h) | |
315 :"r"(stride) | |
316 :"memory"); | |
317 } | |
318 | |
319 static void DEF(put, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h) | |
320 { | |
321 MOVQ_ZERO(mm7); | |
322 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | |
323 __asm __volatile( | |
324 ".balign 8 \n\t" | |
325 "1: \n\t" | |
326 "movq (%1), %%mm0 \n\t" | |
327 "movq (%2), %%mm1 \n\t" | |
328 "movq 256(%2), %%mm2 \n\t" | |
329 "movq 528(%2), %%mm3 \n\t" | |
330 "punpcklbw %%mm7, %%mm0 \n\t" | |
331 "punpcklbw %%mm7, %%mm1 \n\t" | |
332 "punpcklbw %%mm7, %%mm2 \n\t" | |
333 "punpcklbw %%mm7, %%mm3 \n\t" | |
334 "paddusw %%mm6, %%mm0 \n\t" | |
335 "paddusw %%mm0, %%mm1 \n\t" | |
336 "paddusw %%mm2, %%mm3 \n\t" | |
337 "paddusw %%mm1, %%mm3 \n\t" | |
338 "psrlw $2, %%mm3 \n\t" | |
339 "movq (%1), %%mm0 \n\t" | |
340 "movq (%2), %%mm1 \n\t" | |
341 "movq 256(%2), %%mm2 \n\t" | |
342 "movq 528(%2), %%mm4 \n\t" | |
343 "punpckhbw %%mm7, %%mm0 \n\t" | |
344 "punpckhbw %%mm7, %%mm1 \n\t" | |
345 "punpckhbw %%mm7, %%mm2 \n\t" | |
346 "punpckhbw %%mm7, %%mm4 \n\t" | |
347 "paddusw %%mm6, %%mm0 \n\t" | |
348 "paddusw %%mm0, %%mm1 \n\t" | |
349 "paddusw %%mm2, %%mm4 \n\t" | |
350 "paddusw %%mm1, %%mm4 \n\t" | |
351 "psrlw $2, %%mm4 \n\t" | |
352 "packuswb %%mm4, %%mm3 \n\t" | |
353 "movq %%mm3, (%0) \n\t" | |
354 "movq 8(%1), %%mm0 \n\t" | |
355 "movq 8(%2), %%mm1 \n\t" | |
356 "movq 264(%2), %%mm2 \n\t" | |
357 "movq 536(%2), %%mm3 \n\t" | |
358 "punpcklbw %%mm7, %%mm0 \n\t" | |
359 "punpcklbw %%mm7, %%mm1 \n\t" | |
360 "punpcklbw %%mm7, %%mm2 \n\t" | |
361 "punpcklbw %%mm7, %%mm3 \n\t" | |
362 "paddusw %%mm6, %%mm0 \n\t" | |
363 "paddusw %%mm0, %%mm1 \n\t" | |
364 "paddusw %%mm2, %%mm3 \n\t" | |
365 "paddusw %%mm1, %%mm3 \n\t" | |
366 "psrlw $2, %%mm3 \n\t" | |
367 "movq 8(%1), %%mm0 \n\t" | |
368 "movq 8(%2), %%mm1 \n\t" | |
369 "movq 264(%2), %%mm2 \n\t" | |
370 "movq 536(%2), %%mm4 \n\t" | |
371 "punpckhbw %%mm7, %%mm0 \n\t" | |
372 "punpckhbw %%mm7, %%mm1 \n\t" | |
373 "punpckhbw %%mm7, %%mm2 \n\t" | |
374 "punpckhbw %%mm7, %%mm4 \n\t" | |
375 "paddusw %%mm6, %%mm0 \n\t" | |
376 "paddusw %%mm0, %%mm1 \n\t" | |
377 "paddusw %%mm2, %%mm4 \n\t" | |
378 "paddusw %%mm1, %%mm4 \n\t" | |
379 "psrlw $2, %%mm4 \n\t" | |
380 "packuswb %%mm4, %%mm3 \n\t" | |
381 "movq %%mm3, 8(%0) \n\t" | |
382 "addl %4, %0 \n\t" | |
383 "addl %4, %1 \n\t" | |
384 "addl $16, %2 \n\t" | |
385 "decl %3 \n\t" | |
386 "jnz 1b \n\t" | |
387 :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h) | |
388 :"r"(stride) | |
389 :"memory"); | |
390 } | 295 } |
391 | 296 |
392 // avg_pixels | 297 // avg_pixels |
393 // in case more speed is needed - unrolling would certainly help | 298 // in case more speed is needed - unrolling would certainly help |
394 static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h) | 299 static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h) |
639 :"+g"(h), "+S"(pixels) | 544 :"+g"(h), "+S"(pixels) |
640 :"D"(block), "r"(line_size) | 545 :"D"(block), "r"(line_size) |
641 :"eax", "memory"); | 546 :"eax", "memory"); |
642 } | 547 } |
643 | 548 |
644 static void DEF(avg, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h) | |
645 { | |
646 MOVQ_ZERO(mm7); | |
647 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | |
648 MOVQ_BFE(mm5); | |
649 __asm __volatile( | |
650 ".balign 8 \n\t" | |
651 "1: \n\t" | |
652 "movq (%1), %%mm0 \n\t" | |
653 "movq (%2), %%mm1 \n\t" | |
654 "movq 64(%2), %%mm2 \n\t" | |
655 "movq 136(%2), %%mm3 \n\t" | |
656 "punpcklbw %%mm7, %%mm0 \n\t" | |
657 "punpcklbw %%mm7, %%mm1 \n\t" | |
658 "punpcklbw %%mm7, %%mm2 \n\t" | |
659 "punpcklbw %%mm7, %%mm3 \n\t" | |
660 "paddusw %%mm6, %%mm0 \n\t" | |
661 "paddusw %%mm0, %%mm1 \n\t" | |
662 "paddusw %%mm2, %%mm3 \n\t" | |
663 "paddusw %%mm1, %%mm3 \n\t" | |
664 "psrlw $2, %%mm3 \n\t" | |
665 "movq (%1), %%mm0 \n\t" | |
666 "movq (%2), %%mm1 \n\t" | |
667 "movq 64(%2), %%mm2 \n\t" | |
668 "movq 136(%2), %%mm4 \n\t" | |
669 "punpckhbw %%mm7, %%mm0 \n\t" | |
670 "punpckhbw %%mm7, %%mm1 \n\t" | |
671 "punpckhbw %%mm7, %%mm2 \n\t" | |
672 "punpckhbw %%mm7, %%mm4 \n\t" | |
673 "paddusw %%mm6, %%mm0 \n\t" | |
674 "paddusw %%mm0, %%mm1 \n\t" | |
675 "paddusw %%mm2, %%mm4 \n\t" | |
676 "paddusw %%mm1, %%mm4 \n\t" | |
677 "psrlw $2, %%mm4 \n\t" | |
678 "packuswb %%mm4, %%mm3 \n\t" | |
679 "movq (%0), %%mm4 \n\t" | |
680 PAVGB(%%mm3, %%mm4, %%mm0, %%mm5) | |
681 "movq %%mm0, (%0) \n\t" | |
682 "addl %4, %0 \n\t" | |
683 "addl %4, %1 \n\t" | |
684 "addl $8, %2 \n\t" | |
685 "decl %3 \n\t" | |
686 "jnz 1b \n\t" | |
687 :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h) | |
688 :"r"(stride) | |
689 :"memory"); | |
690 } | |
691 | |
692 static void DEF(avg, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h) | |
693 { | |
694 MOVQ_ZERO(mm7); | |
695 SET_RND(mm6); // =2 for rnd and =1 for no_rnd version | |
696 MOVQ_BFE(mm5); | |
697 __asm __volatile( | |
698 ".balign 8 \n\t" | |
699 "1: \n\t" | |
700 "movq (%1), %%mm0 \n\t" | |
701 "movq (%2), %%mm1 \n\t" | |
702 "movq 256(%2), %%mm2 \n\t" | |
703 "movq 528(%2), %%mm3 \n\t" | |
704 "punpcklbw %%mm7, %%mm0 \n\t" | |
705 "punpcklbw %%mm7, %%mm1 \n\t" | |
706 "punpcklbw %%mm7, %%mm2 \n\t" | |
707 "punpcklbw %%mm7, %%mm3 \n\t" | |
708 "paddusw %%mm6, %%mm0 \n\t" | |
709 "paddusw %%mm0, %%mm1 \n\t" | |
710 "paddusw %%mm2, %%mm3 \n\t" | |
711 "paddusw %%mm1, %%mm3 \n\t" | |
712 "psrlw $2, %%mm3 \n\t" | |
713 "movq (%1), %%mm0 \n\t" | |
714 "movq (%2), %%mm1 \n\t" | |
715 "movq 256(%2), %%mm2 \n\t" | |
716 "movq 528(%2), %%mm4 \n\t" | |
717 "punpckhbw %%mm7, %%mm0 \n\t" | |
718 "punpckhbw %%mm7, %%mm1 \n\t" | |
719 "punpckhbw %%mm7, %%mm2 \n\t" | |
720 "punpckhbw %%mm7, %%mm4 \n\t" | |
721 "paddusw %%mm6, %%mm0 \n\t" | |
722 "paddusw %%mm0, %%mm1 \n\t" | |
723 "paddusw %%mm2, %%mm4 \n\t" | |
724 "paddusw %%mm1, %%mm4 \n\t" | |
725 "psrlw $2, %%mm4 \n\t" | |
726 "packuswb %%mm4, %%mm3 \n\t" | |
727 "movq (%0), %%mm4 \n\t" | |
728 PAVGB(%%mm3, %%mm4, %%mm0, %%mm5) | |
729 "movq %%mm0, (%0) \n\t" | |
730 "movq 8(%1), %%mm0 \n\t" | |
731 "movq 8(%2), %%mm1 \n\t" | |
732 "movq 264(%2), %%mm2 \n\t" | |
733 "movq 536(%2), %%mm3 \n\t" | |
734 "punpcklbw %%mm7, %%mm0 \n\t" | |
735 "punpcklbw %%mm7, %%mm1 \n\t" | |
736 "punpcklbw %%mm7, %%mm2 \n\t" | |
737 "punpcklbw %%mm7, %%mm3 \n\t" | |
738 "paddusw %%mm6, %%mm0 \n\t" | |
739 "paddusw %%mm0, %%mm1 \n\t" | |
740 "paddusw %%mm2, %%mm3 \n\t" | |
741 "paddusw %%mm1, %%mm3 \n\t" | |
742 "psrlw $2, %%mm3 \n\t" | |
743 "movq 8(%1), %%mm0 \n\t" | |
744 "movq 8(%2), %%mm1 \n\t" | |
745 "movq 264(%2), %%mm2 \n\t" | |
746 "movq 536(%2), %%mm4 \n\t" | |
747 "punpckhbw %%mm7, %%mm0 \n\t" | |
748 "punpckhbw %%mm7, %%mm1 \n\t" | |
749 "punpckhbw %%mm7, %%mm2 \n\t" | |
750 "punpckhbw %%mm7, %%mm4 \n\t" | |
751 "paddusw %%mm6, %%mm0 \n\t" | |
752 "paddusw %%mm0, %%mm1 \n\t" | |
753 "paddusw %%mm2, %%mm4 \n\t" | |
754 "paddusw %%mm1, %%mm4 \n\t" | |
755 "psrlw $2, %%mm4 \n\t" | |
756 "packuswb %%mm4, %%mm3 \n\t" | |
757 "movq 8(%0), %%mm4 \n\t" | |
758 PAVGB(%%mm3, %%mm4, %%mm0, %%mm5) | |
759 "movq %%mm0, 8(%0) \n\t" | |
760 "addl %4, %0 \n\t" | |
761 "addl %4, %1 \n\t" | |
762 "addl $16, %2 \n\t" | |
763 "decl %3 \n\t" | |
764 "jnz 1b \n\t" | |
765 :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h) | |
766 :"r"(stride) | |
767 :"memory"); | |
768 } | |
769 | |
770 | |
771 //FIXME optimize | 549 //FIXME optimize |
772 static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ | 550 static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ |
773 DEF(put, pixels8_y2)(block , pixels , line_size, h); | 551 DEF(put, pixels8_y2)(block , pixels , line_size, h); |
774 DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); | 552 DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); |
775 } | 553 } |