comparison i386/h264dsp_mmx.c @ 8031:eebc7209c47f libavcodec

Convert the asm keyword into __asm__. Neither the asm() nor the __asm__() keyword is part of the C99 standard, but while GCC accepts the former in C89 syntax, it is not accepted in C99 unless GNU extensions are turned on (with -fasm). The latter form is accepted in any syntax as an extension (without requiring further command-line options). The Sun Studio C99 compiler likewise does not accept asm() while accepting __asm__(), albeit reporting warnings that it is not valid C99 syntax.
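For context, a minimal sketch (not part of this changeset) of the behaviour the commit message describes; the file name, the USE_PLAIN_ASM macro and the cpu_pause() helper are illustrative only:

/* asm_keyword.c -- minimal sketch, not from the changeset; names are illustrative.
 * gcc -std=c99   -c asm_keyword.c                    -> builds (__asm__ branch)
 * gcc -std=c99   -DUSE_PLAIN_ASM -c asm_keyword.c    -> fails: plain asm needs GNU extensions
 * gcc -std=gnu99 -DUSE_PLAIN_ASM -c asm_keyword.c    -> builds (both keywords accepted)
 */
static void cpu_pause(void)
{
#ifdef USE_PLAIN_ASM
    asm     volatile("rep; nop");   /* C89 / GNU-extension syntax only */
#else
    __asm__ volatile("rep; nop");   /* accepted even in strict -std=c99 */
#endif
}

int main(void)
{
    cpu_pause();
    return 0;
}

Because __asm__ is a reserved-namespace identifier, it stays available in every conformance mode, which is why the change below is a mechanical keyword swap with no behavioural difference in the generated code.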
author flameeyes
date Thu, 16 Oct 2008 13:34:09 +0000
parents 483421b11d98
children de2509cf3c44
comparison
8030:a512ac8fa540 8031:eebc7209c47f
55 "movd "#p", (%0) \n\t" 55 "movd "#p", (%0) \n\t"
56 56
57 static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) 57 static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
58 { 58 {
59 /* Load dct coeffs */ 59 /* Load dct coeffs */
60 asm volatile( 60 __asm__ volatile(
61 "movq (%0), %%mm0 \n\t" 61 "movq (%0), %%mm0 \n\t"
62 "movq 8(%0), %%mm1 \n\t" 62 "movq 8(%0), %%mm1 \n\t"
63 "movq 16(%0), %%mm2 \n\t" 63 "movq 16(%0), %%mm2 \n\t"
64 "movq 24(%0), %%mm3 \n\t" 64 "movq 24(%0), %%mm3 \n\t"
65 :: "r"(block) ); 65 :: "r"(block) );
66 66
67 asm volatile( 67 __asm__ volatile(
68 /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ 68 /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */
69 IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) 69 IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
70 70
71 "movq %0, %%mm6 \n\t" 71 "movq %0, %%mm6 \n\t"
72 /* in: 1,4,0,2 out: 1,2,3,0 */ 72 /* in: 1,4,0,2 out: 1,2,3,0 */
78 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) 78 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
79 79
80 "pxor %%mm7, %%mm7 \n\t" 80 "pxor %%mm7, %%mm7 \n\t"
81 :: "m"(ff_pw_32)); 81 :: "m"(ff_pw_32));
82 82
83 asm volatile( 83 __asm__ volatile(
84 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) 84 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
85 "add %1, %0 \n\t" 85 "add %1, %0 \n\t"
86 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) 86 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
87 "add %1, %0 \n\t" 87 "add %1, %0 \n\t"
88 STORE_DIFF_4P( %%mm3, %%mm1, %%mm7) 88 STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
93 ); 93 );
94 } 94 }
95 95
96 static inline void h264_idct8_1d(int16_t *block) 96 static inline void h264_idct8_1d(int16_t *block)
97 { 97 {
98 asm volatile( 98 __asm__ volatile(
99 "movq 112(%0), %%mm7 \n\t" 99 "movq 112(%0), %%mm7 \n\t"
100 "movq 80(%0), %%mm0 \n\t" 100 "movq 80(%0), %%mm0 \n\t"
101 "movq 48(%0), %%mm3 \n\t" 101 "movq 48(%0), %%mm3 \n\t"
102 "movq 16(%0), %%mm5 \n\t" 102 "movq 16(%0), %%mm5 \n\t"
103 103
164 for(i=0; i<2; i++){ 164 for(i=0; i<2; i++){
165 DECLARE_ALIGNED_8(uint64_t, tmp); 165 DECLARE_ALIGNED_8(uint64_t, tmp);
166 166
167 h264_idct8_1d(block+4*i); 167 h264_idct8_1d(block+4*i);
168 168
169 asm volatile( 169 __asm__ volatile(
170 "movq %%mm7, %0 \n\t" 170 "movq %%mm7, %0 \n\t"
171 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) 171 TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
172 "movq %%mm0, 8(%1) \n\t" 172 "movq %%mm0, 8(%1) \n\t"
173 "movq %%mm6, 24(%1) \n\t" 173 "movq %%mm6, 24(%1) \n\t"
174 "movq %%mm7, 40(%1) \n\t" 174 "movq %%mm7, 40(%1) \n\t"
186 } 186 }
187 187
188 for(i=0; i<2; i++){ 188 for(i=0; i<2; i++){
189 h264_idct8_1d(b2+4*i); 189 h264_idct8_1d(b2+4*i);
190 190
191 asm volatile( 191 __asm__ volatile(
192 "psraw $6, %%mm7 \n\t" 192 "psraw $6, %%mm7 \n\t"
193 "psraw $6, %%mm6 \n\t" 193 "psraw $6, %%mm6 \n\t"
194 "psraw $6, %%mm5 \n\t" 194 "psraw $6, %%mm5 \n\t"
195 "psraw $6, %%mm4 \n\t" 195 "psraw $6, %%mm4 \n\t"
196 "psraw $6, %%mm3 \n\t" 196 "psraw $6, %%mm3 \n\t"
267 SUMSUB_BA(h, a)\ 267 SUMSUB_BA(h, a)\
268 SUMSUB_BA(d, f) 268 SUMSUB_BA(d, f)
269 269
270 static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride) 270 static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
271 { 271 {
272 asm volatile( 272 __asm__ volatile(
273 "movdqa 0x10(%1), %%xmm1 \n" 273 "movdqa 0x10(%1), %%xmm1 \n"
274 "movdqa 0x20(%1), %%xmm2 \n" 274 "movdqa 0x20(%1), %%xmm2 \n"
275 "movdqa 0x30(%1), %%xmm3 \n" 275 "movdqa 0x30(%1), %%xmm3 \n"
276 "movdqa 0x50(%1), %%xmm5 \n" 276 "movdqa 0x50(%1), %%xmm5 \n"
277 "movdqa 0x60(%1), %%xmm6 \n" 277 "movdqa 0x60(%1), %%xmm6 \n"
302 } 302 }
303 303
304 static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) 304 static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
305 { 305 {
306 int dc = (block[0] + 32) >> 6; 306 int dc = (block[0] + 32) >> 6;
307 asm volatile( 307 __asm__ volatile(
308 "movd %0, %%mm0 \n\t" 308 "movd %0, %%mm0 \n\t"
309 "pshufw $0, %%mm0, %%mm0 \n\t" 309 "pshufw $0, %%mm0, %%mm0 \n\t"
310 "pxor %%mm1, %%mm1 \n\t" 310 "pxor %%mm1, %%mm1 \n\t"
311 "psubw %%mm0, %%mm1 \n\t" 311 "psubw %%mm0, %%mm1 \n\t"
312 "packuswb %%mm0, %%mm0 \n\t" 312 "packuswb %%mm0, %%mm0 \n\t"
313 "packuswb %%mm1, %%mm1 \n\t" 313 "packuswb %%mm1, %%mm1 \n\t"
314 ::"r"(dc) 314 ::"r"(dc)
315 ); 315 );
316 asm volatile( 316 __asm__ volatile(
317 "movd %0, %%mm2 \n\t" 317 "movd %0, %%mm2 \n\t"
318 "movd %1, %%mm3 \n\t" 318 "movd %1, %%mm3 \n\t"
319 "movd %2, %%mm4 \n\t" 319 "movd %2, %%mm4 \n\t"
320 "movd %3, %%mm5 \n\t" 320 "movd %3, %%mm5 \n\t"
321 "paddusb %%mm0, %%mm2 \n\t" 321 "paddusb %%mm0, %%mm2 \n\t"
339 339
340 static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) 340 static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
341 { 341 {
342 int dc = (block[0] + 32) >> 6; 342 int dc = (block[0] + 32) >> 6;
343 int y; 343 int y;
344 asm volatile( 344 __asm__ volatile(
345 "movd %0, %%mm0 \n\t" 345 "movd %0, %%mm0 \n\t"
346 "pshufw $0, %%mm0, %%mm0 \n\t" 346 "pshufw $0, %%mm0, %%mm0 \n\t"
347 "pxor %%mm1, %%mm1 \n\t" 347 "pxor %%mm1, %%mm1 \n\t"
348 "psubw %%mm0, %%mm1 \n\t" 348 "psubw %%mm0, %%mm1 \n\t"
349 "packuswb %%mm0, %%mm0 \n\t" 349 "packuswb %%mm0, %%mm0 \n\t"
350 "packuswb %%mm1, %%mm1 \n\t" 350 "packuswb %%mm1, %%mm1 \n\t"
351 ::"r"(dc) 351 ::"r"(dc)
352 ); 352 );
353 for(y=2; y--; dst += 4*stride){ 353 for(y=2; y--; dst += 4*stride){
354 asm volatile( 354 __asm__ volatile(
355 "movq %0, %%mm2 \n\t" 355 "movq %0, %%mm2 \n\t"
356 "movq %1, %%mm3 \n\t" 356 "movq %1, %%mm3 \n\t"
357 "movq %2, %%mm4 \n\t" 357 "movq %2, %%mm4 \n\t"
358 "movq %3, %%mm5 \n\t" 358 "movq %3, %%mm5 \n\t"
359 "paddusb %%mm0, %%mm2 \n\t" 359 "paddusb %%mm0, %%mm2 \n\t"
461 461
462 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) 462 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
463 { 463 {
464 DECLARE_ALIGNED_8(uint64_t, tmp0[2]); 464 DECLARE_ALIGNED_8(uint64_t, tmp0[2]);
465 465
466 asm volatile( 466 __asm__ volatile(
467 "movq (%1,%3), %%mm0 \n\t" //p1 467 "movq (%1,%3), %%mm0 \n\t" //p1
468 "movq (%1,%3,2), %%mm1 \n\t" //p0 468 "movq (%1,%3,2), %%mm1 \n\t" //p0
469 "movq (%2), %%mm2 \n\t" //q0 469 "movq (%2), %%mm2 \n\t" //q0
470 "movq (%2,%3), %%mm3 \n\t" //q1 470 "movq (%2,%3), %%mm3 \n\t" //q1
471 H264_DEBLOCK_MASK(%6, %7) 471 H264_DEBLOCK_MASK(%6, %7)
538 } 538 }
539 } 539 }
540 540
541 static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) 541 static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
542 { 542 {
543 asm volatile( 543 __asm__ volatile(
544 "movq (%0), %%mm0 \n\t" //p1 544 "movq (%0), %%mm0 \n\t" //p1
545 "movq (%0,%2), %%mm1 \n\t" //p0 545 "movq (%0,%2), %%mm1 \n\t" //p0
546 "movq (%1), %%mm2 \n\t" //q0 546 "movq (%1), %%mm2 \n\t" //q0
547 "movq (%1,%2), %%mm3 \n\t" //q1 547 "movq (%1,%2), %%mm3 \n\t" //q1
548 H264_DEBLOCK_MASK(%4, %5) 548 H264_DEBLOCK_MASK(%4, %5)
584 "psubusb %%mm4, "#p0" \n\t"\ 584 "psubusb %%mm4, "#p0" \n\t"\
585 "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\ 585 "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\
586 586
587 static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1) 587 static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
588 { 588 {
589 asm volatile( 589 __asm__ volatile(
590 "movq (%0), %%mm0 \n\t" 590 "movq (%0), %%mm0 \n\t"
591 "movq (%0,%2), %%mm1 \n\t" 591 "movq (%0,%2), %%mm1 \n\t"
592 "movq (%1), %%mm2 \n\t" 592 "movq (%1), %%mm2 \n\t"
593 "movq (%1,%2), %%mm3 \n\t" 593 "movq (%1,%2), %%mm3 \n\t"
594 H264_DEBLOCK_MASK(%3, %4) 594 H264_DEBLOCK_MASK(%3, %4)
626 } 626 }
627 627
628 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], 628 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
629 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) { 629 int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
630 int dir; 630 int dir;
631 asm volatile( 631 __asm__ volatile(
632 "pxor %%mm7, %%mm7 \n\t" 632 "pxor %%mm7, %%mm7 \n\t"
633 "movq %0, %%mm6 \n\t" 633 "movq %0, %%mm6 \n\t"
634 "movq %1, %%mm5 \n\t" 634 "movq %1, %%mm5 \n\t"
635 "movq %2, %%mm4 \n\t" 635 "movq %2, %%mm4 \n\t"
636 ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7) 636 ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7)
637 ); 637 );
638 if(field) 638 if(field)
639 asm volatile( 639 __asm__ volatile(
640 "movq %0, %%mm5 \n\t" 640 "movq %0, %%mm5 \n\t"
641 "movq %1, %%mm4 \n\t" 641 "movq %1, %%mm4 \n\t"
642 ::"m"(ff_pb_3_1), "m"(ff_pb_7_3) 642 ::"m"(ff_pb_3_1), "m"(ff_pb_7_3)
643 ); 643 );
644 644
648 const int d_idx = dir ? -8 : -1; 648 const int d_idx = dir ? -8 : -1;
649 const int mask_mv = dir ? mask_mv1 : mask_mv0; 649 const int mask_mv = dir ? mask_mv1 : mask_mv0;
650 DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL; 650 DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
651 int b_idx, edge, l; 651 int b_idx, edge, l;
652 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { 652 for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
653 asm volatile( 653 __asm__ volatile(
654 "pand %0, %%mm0 \n\t" 654 "pand %0, %%mm0 \n\t"
655 ::"m"(mask_dir) 655 ::"m"(mask_dir)
656 ); 656 );
657 if(!(mask_mv & edge)) { 657 if(!(mask_mv & edge)) {
658 asm volatile("pxor %%mm0, %%mm0 \n\t":); 658 __asm__ volatile("pxor %%mm0, %%mm0 \n\t":);
659 for( l = bidir; l >= 0; l-- ) { 659 for( l = bidir; l >= 0; l-- ) {
660 asm volatile( 660 __asm__ volatile(
661 "movd %0, %%mm1 \n\t" 661 "movd %0, %%mm1 \n\t"
662 "punpckldq %1, %%mm1 \n\t" 662 "punpckldq %1, %%mm1 \n\t"
663 "movq %%mm1, %%mm2 \n\t" 663 "movq %%mm1, %%mm2 \n\t"
664 "psrlw $7, %%mm2 \n\t" 664 "psrlw $7, %%mm2 \n\t"
665 "pand %%mm6, %%mm2 \n\t" 665 "pand %%mm6, %%mm2 \n\t"
686 "m"(mv[l][b_idx+d_idx][0]), 686 "m"(mv[l][b_idx+d_idx][0]),
687 "m"(mv[l][b_idx+d_idx+2][0]) 687 "m"(mv[l][b_idx+d_idx+2][0])
688 ); 688 );
689 } 689 }
690 } 690 }
691 asm volatile( 691 __asm__ volatile(
692 "movd %0, %%mm1 \n\t" 692 "movd %0, %%mm1 \n\t"
693 "por %1, %%mm1 \n\t" 693 "por %1, %%mm1 \n\t"
694 "punpcklbw %%mm7, %%mm1 \n\t" 694 "punpcklbw %%mm7, %%mm1 \n\t"
695 "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn] 695 "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn]
696 ::"m"(nnz[b_idx]), 696 ::"m"(nnz[b_idx]),
697 "m"(nnz[b_idx+d_idx]) 697 "m"(nnz[b_idx+d_idx])
698 ); 698 );
699 asm volatile( 699 __asm__ volatile(
700 "pcmpeqw %%mm7, %%mm0 \n\t" 700 "pcmpeqw %%mm7, %%mm0 \n\t"
701 "pcmpeqw %%mm7, %%mm0 \n\t" 701 "pcmpeqw %%mm7, %%mm0 \n\t"
702 "psrlw $15, %%mm0 \n\t" // nonzero -> 1 702 "psrlw $15, %%mm0 \n\t" // nonzero -> 1
703 "psrlw $14, %%mm1 \n\t" 703 "psrlw $14, %%mm1 \n\t"
704 "movq %%mm0, %%mm2 \n\t" 704 "movq %%mm0, %%mm2 \n\t"
711 ); 711 );
712 } 712 }
713 edges = 4; 713 edges = 4;
714 step = 1; 714 step = 1;
715 } 715 }
716 asm volatile( 716 __asm__ volatile(
717 "movq (%0), %%mm0 \n\t" 717 "movq (%0), %%mm0 \n\t"
718 "movq 8(%0), %%mm1 \n\t" 718 "movq 8(%0), %%mm1 \n\t"
719 "movq 16(%0), %%mm2 \n\t" 719 "movq 16(%0), %%mm2 \n\t"
720 "movq 24(%0), %%mm3 \n\t" 720 "movq 24(%0), %%mm3 \n\t"
721 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4) 721 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
772 772
773 #define QPEL_H264(OPNAME, OP, MMX)\ 773 #define QPEL_H264(OPNAME, OP, MMX)\
774 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 774 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
775 int h=4;\ 775 int h=4;\
776 \ 776 \
777 asm volatile(\ 777 __asm__ volatile(\
778 "pxor %%mm7, %%mm7 \n\t"\ 778 "pxor %%mm7, %%mm7 \n\t"\
779 "movq %5, %%mm4 \n\t"\ 779 "movq %5, %%mm4 \n\t"\
780 "movq %6, %%mm5 \n\t"\ 780 "movq %6, %%mm5 \n\t"\
781 "1: \n\t"\ 781 "1: \n\t"\
782 "movd -1(%0), %%mm1 \n\t"\ 782 "movd -1(%0), %%mm1 \n\t"\
811 : "memory"\ 811 : "memory"\
812 );\ 812 );\
813 }\ 813 }\
814 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 814 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
815 int h=4;\ 815 int h=4;\
816 asm volatile(\ 816 __asm__ volatile(\
817 "pxor %%mm7, %%mm7 \n\t"\ 817 "pxor %%mm7, %%mm7 \n\t"\
818 "movq %0, %%mm4 \n\t"\ 818 "movq %0, %%mm4 \n\t"\
819 "movq %1, %%mm5 \n\t"\ 819 "movq %1, %%mm5 \n\t"\
820 :: "m"(ff_pw_5), "m"(ff_pw_16)\ 820 :: "m"(ff_pw_5), "m"(ff_pw_16)\
821 );\ 821 );\
822 do{\ 822 do{\
823 asm volatile(\ 823 __asm__ volatile(\
824 "movd -1(%0), %%mm1 \n\t"\ 824 "movd -1(%0), %%mm1 \n\t"\
825 "movd (%0), %%mm2 \n\t"\ 825 "movd (%0), %%mm2 \n\t"\
826 "movd 1(%0), %%mm3 \n\t"\ 826 "movd 1(%0), %%mm3 \n\t"\
827 "movd 2(%0), %%mm0 \n\t"\ 827 "movd 2(%0), %%mm0 \n\t"\
828 "punpcklbw %%mm7, %%mm1 \n\t"\ 828 "punpcklbw %%mm7, %%mm1 \n\t"\
855 );\ 855 );\
856 }while(--h);\ 856 }while(--h);\
857 }\ 857 }\
858 static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 858 static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
859 src -= 2*srcStride;\ 859 src -= 2*srcStride;\
860 asm volatile(\ 860 __asm__ volatile(\
861 "pxor %%mm7, %%mm7 \n\t"\ 861 "pxor %%mm7, %%mm7 \n\t"\
862 "movd (%0), %%mm0 \n\t"\ 862 "movd (%0), %%mm0 \n\t"\
863 "add %2, %0 \n\t"\ 863 "add %2, %0 \n\t"\
864 "movd (%0), %%mm1 \n\t"\ 864 "movd (%0), %%mm1 \n\t"\
865 "add %2, %0 \n\t"\ 865 "add %2, %0 \n\t"\
887 static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ 887 static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
888 int h=4;\ 888 int h=4;\
889 int w=3;\ 889 int w=3;\
890 src -= 2*srcStride+2;\ 890 src -= 2*srcStride+2;\
891 while(w--){\ 891 while(w--){\
892 asm volatile(\ 892 __asm__ volatile(\
893 "pxor %%mm7, %%mm7 \n\t"\ 893 "pxor %%mm7, %%mm7 \n\t"\
894 "movd (%0), %%mm0 \n\t"\ 894 "movd (%0), %%mm0 \n\t"\
895 "add %2, %0 \n\t"\ 895 "add %2, %0 \n\t"\
896 "movd (%0), %%mm1 \n\t"\ 896 "movd (%0), %%mm1 \n\t"\
897 "add %2, %0 \n\t"\ 897 "add %2, %0 \n\t"\
917 );\ 917 );\
918 tmp += 4;\ 918 tmp += 4;\
919 src += 4 - 9*srcStride;\ 919 src += 4 - 9*srcStride;\
920 }\ 920 }\
921 tmp -= 3*4;\ 921 tmp -= 3*4;\
922 asm volatile(\ 922 __asm__ volatile(\
923 "1: \n\t"\ 923 "1: \n\t"\
924 "movq (%0), %%mm0 \n\t"\ 924 "movq (%0), %%mm0 \n\t"\
925 "paddw 10(%0), %%mm0 \n\t"\ 925 "paddw 10(%0), %%mm0 \n\t"\
926 "movq 2(%0), %%mm1 \n\t"\ 926 "movq 2(%0), %%mm1 \n\t"\
927 "paddw 8(%0), %%mm1 \n\t"\ 927 "paddw 8(%0), %%mm1 \n\t"\
946 );\ 946 );\
947 }\ 947 }\
948 \ 948 \
949 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 949 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
950 int h=8;\ 950 int h=8;\
951 asm volatile(\ 951 __asm__ volatile(\
952 "pxor %%mm7, %%mm7 \n\t"\ 952 "pxor %%mm7, %%mm7 \n\t"\
953 "movq %5, %%mm6 \n\t"\ 953 "movq %5, %%mm6 \n\t"\
954 "1: \n\t"\ 954 "1: \n\t"\
955 "movq (%0), %%mm0 \n\t"\ 955 "movq (%0), %%mm0 \n\t"\
956 "movq 1(%0), %%mm2 \n\t"\ 956 "movq 1(%0), %%mm2 \n\t"\
1003 );\ 1003 );\
1004 }\ 1004 }\
1005 \ 1005 \
1006 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 1006 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
1007 int h=8;\ 1007 int h=8;\
1008 asm volatile(\ 1008 __asm__ volatile(\
1009 "pxor %%mm7, %%mm7 \n\t"\ 1009 "pxor %%mm7, %%mm7 \n\t"\
1010 "movq %0, %%mm6 \n\t"\ 1010 "movq %0, %%mm6 \n\t"\
1011 :: "m"(ff_pw_5)\ 1011 :: "m"(ff_pw_5)\
1012 );\ 1012 );\
1013 do{\ 1013 do{\
1014 asm volatile(\ 1014 __asm__ volatile(\
1015 "movq (%0), %%mm0 \n\t"\ 1015 "movq (%0), %%mm0 \n\t"\
1016 "movq 1(%0), %%mm2 \n\t"\ 1016 "movq 1(%0), %%mm2 \n\t"\
1017 "movq %%mm0, %%mm1 \n\t"\ 1017 "movq %%mm0, %%mm1 \n\t"\
1018 "movq %%mm2, %%mm3 \n\t"\ 1018 "movq %%mm2, %%mm3 \n\t"\
1019 "punpcklbw %%mm7, %%mm0 \n\t"\ 1019 "punpcklbw %%mm7, %%mm0 \n\t"\
1069 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1069 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1070 int w= 2;\ 1070 int w= 2;\
1071 src -= 2*srcStride;\ 1071 src -= 2*srcStride;\
1072 \ 1072 \
1073 while(w--){\ 1073 while(w--){\
1074 asm volatile(\ 1074 __asm__ volatile(\
1075 "pxor %%mm7, %%mm7 \n\t"\ 1075 "pxor %%mm7, %%mm7 \n\t"\
1076 "movd (%0), %%mm0 \n\t"\ 1076 "movd (%0), %%mm0 \n\t"\
1077 "add %2, %0 \n\t"\ 1077 "add %2, %0 \n\t"\
1078 "movd (%0), %%mm1 \n\t"\ 1078 "movd (%0), %%mm1 \n\t"\
1079 "add %2, %0 \n\t"\ 1079 "add %2, %0 \n\t"\
1100 : "+a"(src), "+c"(dst)\ 1100 : "+a"(src), "+c"(dst)\
1101 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1101 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
1102 : "memory"\ 1102 : "memory"\
1103 );\ 1103 );\
1104 if(h==16){\ 1104 if(h==16){\
1105 asm volatile(\ 1105 __asm__ volatile(\
1106 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\ 1106 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
1107 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\ 1107 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
1108 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\ 1108 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
1109 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\ 1109 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
1110 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\ 1110 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
1123 }\ 1123 }\
1124 static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ 1124 static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
1125 int w = (size+8)>>2;\ 1125 int w = (size+8)>>2;\
1126 src -= 2*srcStride+2;\ 1126 src -= 2*srcStride+2;\
1127 while(w--){\ 1127 while(w--){\
1128 asm volatile(\ 1128 __asm__ volatile(\
1129 "pxor %%mm7, %%mm7 \n\t"\ 1129 "pxor %%mm7, %%mm7 \n\t"\
1130 "movd (%0), %%mm0 \n\t"\ 1130 "movd (%0), %%mm0 \n\t"\
1131 "add %2, %0 \n\t"\ 1131 "add %2, %0 \n\t"\
1132 "movd (%0), %%mm1 \n\t"\ 1132 "movd (%0), %%mm1 \n\t"\
1133 "add %2, %0 \n\t"\ 1133 "add %2, %0 \n\t"\
1153 : "+a"(src)\ 1153 : "+a"(src)\
1154 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1154 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
1155 : "memory"\ 1155 : "memory"\
1156 );\ 1156 );\
1157 if(size==16){\ 1157 if(size==16){\
1158 asm volatile(\ 1158 __asm__ volatile(\
1159 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\ 1159 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
1160 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\ 1160 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
1161 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\ 1161 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
1162 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\ 1162 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
1163 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\ 1163 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
1175 }\ 1175 }\
1176 static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ 1176 static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
1177 int w = size>>4;\ 1177 int w = size>>4;\
1178 do{\ 1178 do{\
1179 int h = size;\ 1179 int h = size;\
1180 asm volatile(\ 1180 __asm__ volatile(\
1181 "1: \n\t"\ 1181 "1: \n\t"\
1182 "movq (%0), %%mm0 \n\t"\ 1182 "movq (%0), %%mm0 \n\t"\
1183 "movq 8(%0), %%mm3 \n\t"\ 1183 "movq 8(%0), %%mm3 \n\t"\
1184 "movq 2(%0), %%mm1 \n\t"\ 1184 "movq 2(%0), %%mm1 \n\t"\
1185 "movq 10(%0), %%mm4 \n\t"\ 1185 "movq 10(%0), %%mm4 \n\t"\
1259 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ 1259 OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
1260 }\ 1260 }\
1261 \ 1261 \
1262 static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ 1262 static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
1263 {\ 1263 {\
1264 asm volatile(\ 1264 __asm__ volatile(\
1265 "movq (%1), %%mm0 \n\t"\ 1265 "movq (%1), %%mm0 \n\t"\
1266 "movq 24(%1), %%mm1 \n\t"\ 1266 "movq 24(%1), %%mm1 \n\t"\
1267 "psraw $5, %%mm0 \n\t"\ 1267 "psraw $5, %%mm0 \n\t"\
1268 "psraw $5, %%mm1 \n\t"\ 1268 "psraw $5, %%mm1 \n\t"\
1269 "packuswb %%mm0, %%mm0 \n\t"\ 1269 "packuswb %%mm0, %%mm0 \n\t"\
1289 :"memory");\ 1289 :"memory");\
1290 }\ 1290 }\
1291 static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ 1291 static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
1292 {\ 1292 {\
1293 do{\ 1293 do{\
1294 asm volatile(\ 1294 __asm__ volatile(\
1295 "movq (%1), %%mm0 \n\t"\ 1295 "movq (%1), %%mm0 \n\t"\
1296 "movq 8(%1), %%mm1 \n\t"\ 1296 "movq 8(%1), %%mm1 \n\t"\
1297 "movq 48(%1), %%mm2 \n\t"\ 1297 "movq 48(%1), %%mm2 \n\t"\
1298 "movq 8+48(%1), %%mm3 \n\t"\ 1298 "movq 8+48(%1), %%mm3 \n\t"\
1299 "psraw $5, %%mm0 \n\t"\ 1299 "psraw $5, %%mm0 \n\t"\
1323 1323
1324 #ifdef ARCH_X86_64 1324 #ifdef ARCH_X86_64
1325 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 1325 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
1326 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 1326 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
1327 int h=16;\ 1327 int h=16;\
1328 asm volatile(\ 1328 __asm__ volatile(\
1329 "pxor %%xmm15, %%xmm15 \n\t"\ 1329 "pxor %%xmm15, %%xmm15 \n\t"\
1330 "movdqa %6, %%xmm14 \n\t"\ 1330 "movdqa %6, %%xmm14 \n\t"\
1331 "movdqa %7, %%xmm13 \n\t"\ 1331 "movdqa %7, %%xmm13 \n\t"\
1332 "1: \n\t"\ 1332 "1: \n\t"\
1333 "lddqu 3(%0), %%xmm1 \n\t"\ 1333 "lddqu 3(%0), %%xmm1 \n\t"\
1401 #endif // ARCH_X86_64 1401 #endif // ARCH_X86_64
1402 1402
1403 #define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ 1403 #define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
1404 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ 1404 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
1405 int h=8;\ 1405 int h=8;\
1406 asm volatile(\ 1406 __asm__ volatile(\
1407 "pxor %%xmm7, %%xmm7 \n\t"\ 1407 "pxor %%xmm7, %%xmm7 \n\t"\
1408 "movdqa %0, %%xmm6 \n\t"\ 1408 "movdqa %0, %%xmm6 \n\t"\
1409 :: "m"(ff_pw_5)\ 1409 :: "m"(ff_pw_5)\
1410 );\ 1410 );\
1411 do{\ 1411 do{\
1412 asm volatile(\ 1412 __asm__ volatile(\
1413 "lddqu -5(%0), %%xmm1 \n\t"\ 1413 "lddqu -5(%0), %%xmm1 \n\t"\
1414 "movdqa %%xmm1, %%xmm0 \n\t"\ 1414 "movdqa %%xmm1, %%xmm0 \n\t"\
1415 "punpckhbw %%xmm7, %%xmm1 \n\t"\ 1415 "punpckhbw %%xmm7, %%xmm1 \n\t"\
1416 "punpcklbw %%xmm7, %%xmm0 \n\t"\ 1416 "punpcklbw %%xmm7, %%xmm0 \n\t"\
1417 "movdqa %%xmm1, %%xmm2 \n\t"\ 1417 "movdqa %%xmm1, %%xmm2 \n\t"\
1448 }\ 1448 }\
1449 QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ 1449 QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
1450 \ 1450 \
1451 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ 1451 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1452 int h=8;\ 1452 int h=8;\
1453 asm volatile(\ 1453 __asm__ volatile(\
1454 "pxor %%xmm7, %%xmm7 \n\t"\ 1454 "pxor %%xmm7, %%xmm7 \n\t"\
1455 "movdqa %5, %%xmm6 \n\t"\ 1455 "movdqa %5, %%xmm6 \n\t"\
1456 "1: \n\t"\ 1456 "1: \n\t"\
1457 "lddqu -5(%0), %%xmm1 \n\t"\ 1457 "lddqu -5(%0), %%xmm1 \n\t"\
1458 "movdqa %%xmm1, %%xmm0 \n\t"\ 1458 "movdqa %%xmm1, %%xmm0 \n\t"\
1499 1499
1500 #define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ 1500 #define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
1501 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ 1501 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1502 src -= 2*srcStride;\ 1502 src -= 2*srcStride;\
1503 \ 1503 \
1504 asm volatile(\ 1504 __asm__ volatile(\
1505 "pxor %%xmm7, %%xmm7 \n\t"\ 1505 "pxor %%xmm7, %%xmm7 \n\t"\
1506 "movq (%0), %%xmm0 \n\t"\ 1506 "movq (%0), %%xmm0 \n\t"\
1507 "add %2, %0 \n\t"\ 1507 "add %2, %0 \n\t"\
1508 "movq (%0), %%xmm1 \n\t"\ 1508 "movq (%0), %%xmm1 \n\t"\
1509 "add %2, %0 \n\t"\ 1509 "add %2, %0 \n\t"\
1530 : "+a"(src), "+c"(dst)\ 1530 : "+a"(src), "+c"(dst)\
1531 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ 1531 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
1532 : "memory"\ 1532 : "memory"\
1533 );\ 1533 );\
1534 if(h==16){\ 1534 if(h==16){\
1535 asm volatile(\ 1535 __asm__ volatile(\
1536 QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\ 1536 QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
1537 QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\ 1537 QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
1538 QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\ 1538 QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
1539 QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\ 1539 QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
1540 QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\ 1540 QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
1558 1558
1559 static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){ 1559 static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
1560 int w = (size+8)>>3; 1560 int w = (size+8)>>3;
1561 src -= 2*srcStride+2; 1561 src -= 2*srcStride+2;
1562 while(w--){ 1562 while(w--){
1563 asm volatile( 1563 __asm__ volatile(
1564 "pxor %%xmm7, %%xmm7 \n\t" 1564 "pxor %%xmm7, %%xmm7 \n\t"
1565 "movq (%0), %%xmm0 \n\t" 1565 "movq (%0), %%xmm0 \n\t"
1566 "add %2, %0 \n\t" 1566 "add %2, %0 \n\t"
1567 "movq (%0), %%xmm1 \n\t" 1567 "movq (%0), %%xmm1 \n\t"
1568 "add %2, %0 \n\t" 1568 "add %2, %0 \n\t"
1588 : "+a"(src) 1588 : "+a"(src)
1589 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16) 1589 : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
1590 : "memory" 1590 : "memory"
1591 ); 1591 );
1592 if(size==16){ 1592 if(size==16){
1593 asm volatile( 1593 __asm__ volatile(
1594 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48) 1594 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
1595 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48) 1595 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
1596 QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48) 1596 QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
1597 QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48) 1597 QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
1598 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48) 1598 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
1611 1611
1612 #define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\ 1612 #define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
1613 static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ 1613 static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
1614 int h = size;\ 1614 int h = size;\
1615 if(size == 16){\ 1615 if(size == 16){\
1616 asm volatile(\ 1616 __asm__ volatile(\
1617 "1: \n\t"\ 1617 "1: \n\t"\
1618 "movdqa 32(%0), %%xmm4 \n\t"\ 1618 "movdqa 32(%0), %%xmm4 \n\t"\
1619 "movdqa 16(%0), %%xmm5 \n\t"\ 1619 "movdqa 16(%0), %%xmm5 \n\t"\
1620 "movdqa (%0), %%xmm7 \n\t"\ 1620 "movdqa (%0), %%xmm7 \n\t"\
1621 "movdqa %%xmm4, %%xmm3 \n\t"\ 1621 "movdqa %%xmm4, %%xmm3 \n\t"\
1666 : "+a"(tmp), "+c"(dst), "+g"(h)\ 1666 : "+a"(tmp), "+c"(dst), "+g"(h)\
1667 : "S"((x86_reg)dstStride)\ 1667 : "S"((x86_reg)dstStride)\
1668 : "memory"\ 1668 : "memory"\
1669 );\ 1669 );\
1670 }else{\ 1670 }else{\
1671 asm volatile(\ 1671 __asm__ volatile(\
1672 "1: \n\t"\ 1672 "1: \n\t"\
1673 "movdqa 16(%0), %%xmm1 \n\t"\ 1673 "movdqa 16(%0), %%xmm1 \n\t"\
1674 "movdqa (%0), %%xmm0 \n\t"\ 1674 "movdqa (%0), %%xmm0 \n\t"\
1675 "movdqa %%xmm1, %%xmm2 \n\t"\ 1675 "movdqa %%xmm1, %%xmm2 \n\t"\
1676 "movdqa %%xmm1, %%xmm3 \n\t"\ 1676 "movdqa %%xmm1, %%xmm3 \n\t"\
2020 static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h) 2020 static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
2021 { 2021 {
2022 int x, y; 2022 int x, y;
2023 offset <<= log2_denom; 2023 offset <<= log2_denom;
2024 offset += (1 << log2_denom) >> 1; 2024 offset += (1 << log2_denom) >> 1;
2025 asm volatile( 2025 __asm__ volatile(
2026 "movd %0, %%mm4 \n\t" 2026 "movd %0, %%mm4 \n\t"
2027 "movd %1, %%mm5 \n\t" 2027 "movd %1, %%mm5 \n\t"
2028 "movd %2, %%mm6 \n\t" 2028 "movd %2, %%mm6 \n\t"
2029 "pshufw $0, %%mm4, %%mm4 \n\t" 2029 "pshufw $0, %%mm4, %%mm4 \n\t"
2030 "pshufw $0, %%mm5, %%mm5 \n\t" 2030 "pshufw $0, %%mm5, %%mm5 \n\t"
2031 "pxor %%mm7, %%mm7 \n\t" 2031 "pxor %%mm7, %%mm7 \n\t"
2032 :: "g"(weight), "g"(offset), "g"(log2_denom) 2032 :: "g"(weight), "g"(offset), "g"(log2_denom)
2033 ); 2033 );
2034 for(y=0; y<h; y+=2){ 2034 for(y=0; y<h; y+=2){
2035 for(x=0; x<w; x+=4){ 2035 for(x=0; x<w; x+=4){
2036 asm volatile( 2036 __asm__ volatile(
2037 "movd %0, %%mm0 \n\t" 2037 "movd %0, %%mm0 \n\t"
2038 "movd %1, %%mm1 \n\t" 2038 "movd %1, %%mm1 \n\t"
2039 "punpcklbw %%mm7, %%mm0 \n\t" 2039 "punpcklbw %%mm7, %%mm0 \n\t"
2040 "punpcklbw %%mm7, %%mm1 \n\t" 2040 "punpcklbw %%mm7, %%mm1 \n\t"
2041 "pmullw %%mm4, %%mm0 \n\t" 2041 "pmullw %%mm4, %%mm0 \n\t"
2058 2058
2059 static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h) 2059 static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
2060 { 2060 {
2061 int x, y; 2061 int x, y;
2062 offset = ((offset + 1) | 1) << log2_denom; 2062 offset = ((offset + 1) | 1) << log2_denom;
2063 asm volatile( 2063 __asm__ volatile(
2064 "movd %0, %%mm3 \n\t" 2064 "movd %0, %%mm3 \n\t"
2065 "movd %1, %%mm4 \n\t" 2065 "movd %1, %%mm4 \n\t"
2066 "movd %2, %%mm5 \n\t" 2066 "movd %2, %%mm5 \n\t"
2067 "movd %3, %%mm6 \n\t" 2067 "movd %3, %%mm6 \n\t"
2068 "pshufw $0, %%mm3, %%mm3 \n\t" 2068 "pshufw $0, %%mm3, %%mm3 \n\t"
2071 "pxor %%mm7, %%mm7 \n\t" 2071 "pxor %%mm7, %%mm7 \n\t"
2072 :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1) 2072 :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
2073 ); 2073 );
2074 for(y=0; y<h; y++){ 2074 for(y=0; y<h; y++){
2075 for(x=0; x<w; x+=4){ 2075 for(x=0; x<w; x+=4){
2076 asm volatile( 2076 __asm__ volatile(
2077 "movd %0, %%mm0 \n\t" 2077 "movd %0, %%mm0 \n\t"
2078 "movd %1, %%mm1 \n\t" 2078 "movd %1, %%mm1 \n\t"
2079 "punpcklbw %%mm7, %%mm0 \n\t" 2079 "punpcklbw %%mm7, %%mm0 \n\t"
2080 "punpcklbw %%mm7, %%mm1 \n\t" 2080 "punpcklbw %%mm7, %%mm1 \n\t"
2081 "pmullw %%mm3, %%mm0 \n\t" 2081 "pmullw %%mm3, %%mm0 \n\t"