comparison: i386/h264dsp_mmx.c @ 8031:eebc7209c47f (libavcodec)
Convert asm keyword into __asm__.
Neither the asm() nor the __asm__() keyword is part of the C99
standard. GCC accepts the former in C89 mode, but rejects it in C99
mode unless GNU extensions are enabled (with -fasm); the latter form
is accepted in any mode as an extension, without requiring further
command-line options.

The Sun Studio C99 compiler likewise rejects asm() while accepting
__asm__(), albeit with a warning that it is not valid C99 syntax.
author      flameeyes
date        Thu, 16 Oct 2008 13:34:09 +0000
parents     483421b11d98
children    de2509cf3c44
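
As a quick illustration of the dialect behaviour described above, here is a
minimal standalone sketch (hypothetical, not part of this changeset; the
file name is invented, and it assumes an x86 target since it uses an x86
mnemonic):

    /* asm_keyword.c -- hypothetical example, not part of this changeset.
     *
     *   gcc -std=gnu99 -c asm_keyword.c        # both forms accepted
     *   gcc -std=c99   -c asm_keyword.c        # plain 'asm' rejected
     *   gcc -std=c99 -fasm -c asm_keyword.c    # plain 'asm' accepted again
     */

    /* __asm__ is spelled in the implementation namespace, so it stays
     * available even under strict ISO C modes such as -std=c99. */
    int zero_underscored(void)
    {
        int x;
        __asm__ volatile("xorl %0, %0" : "=r"(x));
        return x;
    }

    /* 'asm' is only recognized as a keyword when GNU extensions are
     * enabled (gnu89/gnu99 modes, or -fasm). */
    int zero_plain(void)
    {
        int x;
        asm volatile("xorl %0, %0" : "=r"(x));
        return x;
    }

This is the substitution performed throughout the diff below: each plain
asm() block becomes __asm__(), with no change to the assembly itself.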
--- i386/h264dsp_mmx.c  8030:a512ac8fa540
+++ i386/h264dsp_mmx.c  8031:eebc7209c47f
55 "movd "#p", (%0) \n\t" | 55 "movd "#p", (%0) \n\t" |
56 | 56 |
57 static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) | 57 static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride) |
58 { | 58 { |
59 /* Load dct coeffs */ | 59 /* Load dct coeffs */ |
60 asm volatile( | 60 __asm__ volatile( |
61 "movq (%0), %%mm0 \n\t" | 61 "movq (%0), %%mm0 \n\t" |
62 "movq 8(%0), %%mm1 \n\t" | 62 "movq 8(%0), %%mm1 \n\t" |
63 "movq 16(%0), %%mm2 \n\t" | 63 "movq 16(%0), %%mm2 \n\t" |
64 "movq 24(%0), %%mm3 \n\t" | 64 "movq 24(%0), %%mm3 \n\t" |
65 :: "r"(block) ); | 65 :: "r"(block) ); |
66 | 66 |
67 asm volatile( | 67 __asm__ volatile( |
68 /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ | 68 /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */ |
69 IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) | 69 IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 ) |
70 | 70 |
71 "movq %0, %%mm6 \n\t" | 71 "movq %0, %%mm6 \n\t" |
72 /* in: 1,4,0,2 out: 1,2,3,0 */ | 72 /* in: 1,4,0,2 out: 1,2,3,0 */ |
78 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) | 78 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 ) |
79 | 79 |
80 "pxor %%mm7, %%mm7 \n\t" | 80 "pxor %%mm7, %%mm7 \n\t" |
81 :: "m"(ff_pw_32)); | 81 :: "m"(ff_pw_32)); |
82 | 82 |
83 asm volatile( | 83 __asm__ volatile( |
84 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) | 84 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7) |
85 "add %1, %0 \n\t" | 85 "add %1, %0 \n\t" |
86 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) | 86 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7) |
87 "add %1, %0 \n\t" | 87 "add %1, %0 \n\t" |
88 STORE_DIFF_4P( %%mm3, %%mm1, %%mm7) | 88 STORE_DIFF_4P( %%mm3, %%mm1, %%mm7) |
93 ); | 93 ); |
94 } | 94 } |
95 | 95 |
96 static inline void h264_idct8_1d(int16_t *block) | 96 static inline void h264_idct8_1d(int16_t *block) |
97 { | 97 { |
98 asm volatile( | 98 __asm__ volatile( |
99 "movq 112(%0), %%mm7 \n\t" | 99 "movq 112(%0), %%mm7 \n\t" |
100 "movq 80(%0), %%mm0 \n\t" | 100 "movq 80(%0), %%mm0 \n\t" |
101 "movq 48(%0), %%mm3 \n\t" | 101 "movq 48(%0), %%mm3 \n\t" |
102 "movq 16(%0), %%mm5 \n\t" | 102 "movq 16(%0), %%mm5 \n\t" |
103 | 103 |
@@ -164,11 +164,11 @@
     for(i=0; i<2; i++){
         DECLARE_ALIGNED_8(uint64_t, tmp);

         h264_idct8_1d(block+4*i);

-        asm volatile(
+        __asm__ volatile(
             "movq %%mm7, %0 \n\t"
             TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
             "movq %%mm0, 8(%1) \n\t"
             "movq %%mm6, 24(%1) \n\t"
             "movq %%mm7, 40(%1) \n\t"
@@ -186,11 +186,11 @@
     }

     for(i=0; i<2; i++){
         h264_idct8_1d(b2+4*i);

-        asm volatile(
+        __asm__ volatile(
             "psraw $6, %%mm7 \n\t"
             "psraw $6, %%mm6 \n\t"
             "psraw $6, %%mm5 \n\t"
             "psraw $6, %%mm4 \n\t"
             "psraw $6, %%mm3 \n\t"
@@ -267,11 +267,11 @@
     SUMSUB_BA(h, a)\
     SUMSUB_BA(d, f)

 static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
 {
-    asm volatile(
+    __asm__ volatile(
         "movdqa 0x10(%1), %%xmm1 \n"
         "movdqa 0x20(%1), %%xmm2 \n"
         "movdqa 0x30(%1), %%xmm3 \n"
         "movdqa 0x50(%1), %%xmm5 \n"
         "movdqa 0x60(%1), %%xmm6 \n"
@@ -302,20 +302,20 @@
 }

 static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
 {
     int dc = (block[0] + 32) >> 6;
-    asm volatile(
+    __asm__ volatile(
         "movd %0, %%mm0 \n\t"
         "pshufw $0, %%mm0, %%mm0 \n\t"
         "pxor %%mm1, %%mm1 \n\t"
         "psubw %%mm0, %%mm1 \n\t"
         "packuswb %%mm0, %%mm0 \n\t"
         "packuswb %%mm1, %%mm1 \n\t"
         ::"r"(dc)
     );
-    asm volatile(
+    __asm__ volatile(
         "movd %0, %%mm2 \n\t"
         "movd %1, %%mm3 \n\t"
         "movd %2, %%mm4 \n\t"
         "movd %3, %%mm5 \n\t"
         "paddusb %%mm0, %%mm2 \n\t"
@@ -339,21 +339,21 @@

 static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
 {
     int dc = (block[0] + 32) >> 6;
     int y;
-    asm volatile(
+    __asm__ volatile(
         "movd %0, %%mm0 \n\t"
         "pshufw $0, %%mm0, %%mm0 \n\t"
         "pxor %%mm1, %%mm1 \n\t"
         "psubw %%mm0, %%mm1 \n\t"
         "packuswb %%mm0, %%mm0 \n\t"
         "packuswb %%mm1, %%mm1 \n\t"
         ::"r"(dc)
     );
     for(y=2; y--; dst += 4*stride){
-        asm volatile(
+        __asm__ volatile(
             "movq %0, %%mm2 \n\t"
             "movq %1, %%mm3 \n\t"
             "movq %2, %%mm4 \n\t"
             "movq %3, %%mm5 \n\t"
             "paddusb %%mm0, %%mm2 \n\t"
@@ -461,11 +461,11 @@

 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
 {
     DECLARE_ALIGNED_8(uint64_t, tmp0[2]);

-    asm volatile(
+    __asm__ volatile(
         "movq (%1,%3), %%mm0 \n\t" //p1
         "movq (%1,%3,2), %%mm1 \n\t" //p0
         "movq (%2), %%mm2 \n\t" //q0
         "movq (%2,%3), %%mm3 \n\t" //q1
         H264_DEBLOCK_MASK(%6, %7)
@@ -538,11 +538,11 @@
     }
 }

 static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
 {
-    asm volatile(
+    __asm__ volatile(
         "movq (%0), %%mm0 \n\t" //p1
         "movq (%0,%2), %%mm1 \n\t" //p0
         "movq (%1), %%mm2 \n\t" //q0
         "movq (%1,%2), %%mm3 \n\t" //q1
         H264_DEBLOCK_MASK(%4, %5)
@@ -584,11 +584,11 @@
     "psubusb %%mm4, "#p0" \n\t"\
     "pavgb "#p1", "#p0" \n\t" /* dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) */\

 static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
 {
-    asm volatile(
+    __asm__ volatile(
         "movq (%0), %%mm0 \n\t"
         "movq (%0,%2), %%mm1 \n\t"
         "movq (%1), %%mm2 \n\t"
         "movq (%1,%2), %%mm3 \n\t"
         H264_DEBLOCK_MASK(%3, %4)
@@ -626,19 +626,19 @@
 }

 static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                             int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
     int dir;
-    asm volatile(
+    __asm__ volatile(
         "pxor %%mm7, %%mm7 \n\t"
         "movq %0, %%mm6 \n\t"
         "movq %1, %%mm5 \n\t"
         "movq %2, %%mm4 \n\t"
         ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7)
     );
     if(field)
-        asm volatile(
+        __asm__ volatile(
             "movq %0, %%mm5 \n\t"
             "movq %1, %%mm4 \n\t"
             ::"m"(ff_pb_3_1), "m"(ff_pb_7_3)
         );

@@ -648,18 +648,18 @@
         const int d_idx = dir ? -8 : -1;
         const int mask_mv = dir ? mask_mv1 : mask_mv0;
         DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
         int b_idx, edge, l;
         for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
-            asm volatile(
+            __asm__ volatile(
                 "pand %0, %%mm0 \n\t"
                 ::"m"(mask_dir)
             );
             if(!(mask_mv & edge)) {
-                asm volatile("pxor %%mm0, %%mm0 \n\t":);
+                __asm__ volatile("pxor %%mm0, %%mm0 \n\t":);
                 for( l = bidir; l >= 0; l-- ) {
-                    asm volatile(
+                    __asm__ volatile(
                         "movd %0, %%mm1 \n\t"
                         "punpckldq %1, %%mm1 \n\t"
                         "movq %%mm1, %%mm2 \n\t"
                         "psrlw $7, %%mm2 \n\t"
                         "pand %%mm6, %%mm2 \n\t"
@@ -686,19 +686,19 @@
                         "m"(mv[l][b_idx+d_idx][0]),
                         "m"(mv[l][b_idx+d_idx+2][0])
                     );
                 }
             }
-            asm volatile(
+            __asm__ volatile(
                 "movd %0, %%mm1 \n\t"
                 "por %1, %%mm1 \n\t"
                 "punpcklbw %%mm7, %%mm1 \n\t"
                 "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn]
                 ::"m"(nnz[b_idx]),
                 "m"(nnz[b_idx+d_idx])
             );
-            asm volatile(
+            __asm__ volatile(
                 "pcmpeqw %%mm7, %%mm0 \n\t"
                 "pcmpeqw %%mm7, %%mm0 \n\t"
                 "psrlw $15, %%mm0 \n\t" // nonzero -> 1
                 "psrlw $14, %%mm1 \n\t"
                 "movq %%mm0, %%mm2 \n\t"
@@ -711,11 +711,11 @@
             );
         }
         edges = 4;
         step = 1;
     }
-    asm volatile(
+    __asm__ volatile(
         "movq (%0), %%mm0 \n\t"
         "movq 8(%0), %%mm1 \n\t"
         "movq 16(%0), %%mm2 \n\t"
         "movq 24(%0), %%mm3 \n\t"
         TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
@@ -772,11 +772,11 @@

 #define QPEL_H264(OPNAME, OP, MMX)\
 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
     int h=4;\
 \
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%mm7, %%mm7 \n\t"\
         "movq %5, %%mm4 \n\t"\
         "movq %6, %%mm5 \n\t"\
         "1: \n\t"\
         "movd -1(%0), %%mm1 \n\t"\
@@ -811,18 +811,18 @@
         : "memory"\
     );\
 }\
 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
     int h=4;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%mm7, %%mm7 \n\t"\
         "movq %0, %%mm4 \n\t"\
         "movq %1, %%mm5 \n\t"\
         :: "m"(ff_pw_5), "m"(ff_pw_16)\
     );\
     do{\
-        asm volatile(\
+        __asm__ volatile(\
             "movd -1(%0), %%mm1 \n\t"\
             "movd (%0), %%mm2 \n\t"\
             "movd 1(%0), %%mm3 \n\t"\
             "movd 2(%0), %%mm0 \n\t"\
             "punpcklbw %%mm7, %%mm1 \n\t"\
@@ -855,11 +855,11 @@
         );\
     }while(--h);\
 }\
 static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
     src -= 2*srcStride;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%mm7, %%mm7 \n\t"\
         "movd (%0), %%mm0 \n\t"\
         "add %2, %0 \n\t"\
         "movd (%0), %%mm1 \n\t"\
         "add %2, %0 \n\t"\
@@ -887,11 +887,11 @@
 static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
     int h=4;\
     int w=3;\
     src -= 2*srcStride+2;\
     while(w--){\
-        asm volatile(\
+        __asm__ volatile(\
             "pxor %%mm7, %%mm7 \n\t"\
             "movd (%0), %%mm0 \n\t"\
             "add %2, %0 \n\t"\
             "movd (%0), %%mm1 \n\t"\
             "add %2, %0 \n\t"\
@@ -917,11 +917,11 @@
         );\
         tmp += 4;\
         src += 4 - 9*srcStride;\
     }\
     tmp -= 3*4;\
-    asm volatile(\
+    __asm__ volatile(\
         "1: \n\t"\
         "movq (%0), %%mm0 \n\t"\
         "paddw 10(%0), %%mm0 \n\t"\
         "movq 2(%0), %%mm1 \n\t"\
         "paddw 8(%0), %%mm1 \n\t"\
@@ -946,11 +946,11 @@
     );\
 }\
 \
 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
     int h=8;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%mm7, %%mm7 \n\t"\
         "movq %5, %%mm6 \n\t"\
         "1: \n\t"\
         "movq (%0), %%mm0 \n\t"\
         "movq 1(%0), %%mm2 \n\t"\
@@ -1003,17 +1003,17 @@
     );\
 }\
 \
 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
     int h=8;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%mm7, %%mm7 \n\t"\
         "movq %0, %%mm6 \n\t"\
         :: "m"(ff_pw_5)\
     );\
     do{\
-        asm volatile(\
+        __asm__ volatile(\
             "movq (%0), %%mm0 \n\t"\
             "movq 1(%0), %%mm2 \n\t"\
             "movq %%mm0, %%mm1 \n\t"\
             "movq %%mm2, %%mm3 \n\t"\
             "punpcklbw %%mm7, %%mm0 \n\t"\
@@ -1069,11 +1069,11 @@
 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
     int w= 2;\
     src -= 2*srcStride;\
 \
     while(w--){\
-        asm volatile(\
+        __asm__ volatile(\
             "pxor %%mm7, %%mm7 \n\t"\
             "movd (%0), %%mm0 \n\t"\
             "add %2, %0 \n\t"\
             "movd (%0), %%mm1 \n\t"\
             "add %2, %0 \n\t"\
@@ -1100,11 +1100,11 @@
             : "+a"(src), "+c"(dst)\
             : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
             : "memory"\
         );\
         if(h==16){\
-            asm volatile(\
+            __asm__ volatile(\
                 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
                 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
                 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
                 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
                 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
@@ -1123,11 +1123,11 @@
 }\
 static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
     int w = (size+8)>>2;\
     src -= 2*srcStride+2;\
     while(w--){\
-        asm volatile(\
+        __asm__ volatile(\
             "pxor %%mm7, %%mm7 \n\t"\
             "movd (%0), %%mm0 \n\t"\
             "add %2, %0 \n\t"\
             "movd (%0), %%mm1 \n\t"\
             "add %2, %0 \n\t"\
@@ -1153,11 +1153,11 @@
             : "+a"(src)\
             : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
             : "memory"\
         );\
         if(size==16){\
-            asm volatile(\
+            __asm__ volatile(\
                 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 8*48)\
                 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 9*48)\
                 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
                 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
                 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
@@ -1175,11 +1175,11 @@
 }\
 static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
     int w = size>>4;\
     do{\
         int h = size;\
-        asm volatile(\
+        __asm__ volatile(\
             "1: \n\t"\
             "movq (%0), %%mm0 \n\t"\
             "movq 8(%0), %%mm3 \n\t"\
             "movq 2(%0), %%mm1 \n\t"\
             "movq 10(%0), %%mm4 \n\t"\
@@ -1259,11 +1259,11 @@
     OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
 }\
 \
 static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
 {\
-    asm volatile(\
+    __asm__ volatile(\
         "movq (%1), %%mm0 \n\t"\
         "movq 24(%1), %%mm1 \n\t"\
         "psraw $5, %%mm0 \n\t"\
         "psraw $5, %%mm1 \n\t"\
         "packuswb %%mm0, %%mm0 \n\t"\
@@ -1289,11 +1289,11 @@
         :"memory");\
 }\
 static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
 {\
     do{\
-        asm volatile(\
+        __asm__ volatile(\
             "movq (%1), %%mm0 \n\t"\
             "movq 8(%1), %%mm1 \n\t"\
             "movq 48(%1), %%mm2 \n\t"\
             "movq 8+48(%1), %%mm3 \n\t"\
             "psraw $5, %%mm0 \n\t"\
@@ -1323,11 +1323,11 @@

 #ifdef ARCH_X86_64
 #define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
     int h=16;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%xmm15, %%xmm15 \n\t"\
         "movdqa %6, %%xmm14 \n\t"\
         "movdqa %7, %%xmm13 \n\t"\
         "1: \n\t"\
         "lddqu 3(%0), %%xmm1 \n\t"\
@@ -1401,17 +1401,17 @@
 #endif // ARCH_X86_64

 #define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
     int h=8;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%xmm7, %%xmm7 \n\t"\
         "movdqa %0, %%xmm6 \n\t"\
         :: "m"(ff_pw_5)\
     );\
     do{\
-        asm volatile(\
+        __asm__ volatile(\
             "lddqu -5(%0), %%xmm1 \n\t"\
             "movdqa %%xmm1, %%xmm0 \n\t"\
             "punpckhbw %%xmm7, %%xmm1 \n\t"\
             "punpcklbw %%xmm7, %%xmm0 \n\t"\
             "movdqa %%xmm1, %%xmm2 \n\t"\
@@ -1448,11 +1448,11 @@
 }\
 QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
 \
 static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
     int h=8;\
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%xmm7, %%xmm7 \n\t"\
         "movdqa %5, %%xmm6 \n\t"\
         "1: \n\t"\
         "lddqu -5(%0), %%xmm1 \n\t"\
         "movdqa %%xmm1, %%xmm0 \n\t"\
@@ -1499,11 +1499,11 @@

 #define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
 static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
     src -= 2*srcStride;\
 \
-    asm volatile(\
+    __asm__ volatile(\
         "pxor %%xmm7, %%xmm7 \n\t"\
         "movq (%0), %%xmm0 \n\t"\
         "add %2, %0 \n\t"\
         "movq (%0), %%xmm1 \n\t"\
         "add %2, %0 \n\t"\
@@ -1530,11 +1530,11 @@
         : "+a"(src), "+c"(dst)\
         : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
         : "memory"\
     );\
     if(h==16){\
-        asm volatile(\
+        __asm__ volatile(\
             QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
             QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
             QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
             QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
             QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
@@ -1558,11 +1558,11 @@

 static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
     int w = (size+8)>>3;
     src -= 2*srcStride+2;
     while(w--){
-        asm volatile(
+        __asm__ volatile(
             "pxor %%xmm7, %%xmm7 \n\t"
             "movq (%0), %%xmm0 \n\t"
             "add %2, %0 \n\t"
             "movq (%0), %%xmm1 \n\t"
             "add %2, %0 \n\t"
@@ -1588,11 +1588,11 @@
             : "+a"(src)
             : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
             : "memory"
         );
         if(size==16){
-            asm volatile(
+            __asm__ volatile(
                 QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 8*48)
                 QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 9*48)
                 QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
                 QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
                 QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
@@ -1611,11 +1611,11 @@

 #define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
 static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
     int h = size;\
     if(size == 16){\
-        asm volatile(\
+        __asm__ volatile(\
             "1: \n\t"\
             "movdqa 32(%0), %%xmm4 \n\t"\
             "movdqa 16(%0), %%xmm5 \n\t"\
             "movdqa (%0), %%xmm7 \n\t"\
             "movdqa %%xmm4, %%xmm3 \n\t"\
@@ -1666,11 +1666,11 @@
             : "+a"(tmp), "+c"(dst), "+g"(h)\
             : "S"((x86_reg)dstStride)\
             : "memory"\
         );\
     }else{\
-        asm volatile(\
+        __asm__ volatile(\
             "1: \n\t"\
             "movdqa 16(%0), %%xmm1 \n\t"\
             "movdqa (%0), %%xmm0 \n\t"\
             "movdqa %%xmm1, %%xmm2 \n\t"\
             "movdqa %%xmm1, %%xmm3 \n\t"\
@@ -2020,22 +2020,22 @@
 static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
 {
     int x, y;
     offset <<= log2_denom;
     offset += (1 << log2_denom) >> 1;
-    asm volatile(
+    __asm__ volatile(
         "movd %0, %%mm4 \n\t"
         "movd %1, %%mm5 \n\t"
         "movd %2, %%mm6 \n\t"
         "pshufw $0, %%mm4, %%mm4 \n\t"
         "pshufw $0, %%mm5, %%mm5 \n\t"
         "pxor %%mm7, %%mm7 \n\t"
         :: "g"(weight), "g"(offset), "g"(log2_denom)
     );
     for(y=0; y<h; y+=2){
         for(x=0; x<w; x+=4){
-            asm volatile(
+            __asm__ volatile(
                 "movd %0, %%mm0 \n\t"
                 "movd %1, %%mm1 \n\t"
                 "punpcklbw %%mm7, %%mm0 \n\t"
                 "punpcklbw %%mm7, %%mm1 \n\t"
                 "pmullw %%mm4, %%mm0 \n\t"
@@ -2058,11 +2058,11 @@

 static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
 {
     int x, y;
     offset = ((offset + 1) | 1) << log2_denom;
-    asm volatile(
+    __asm__ volatile(
         "movd %0, %%mm3 \n\t"
         "movd %1, %%mm4 \n\t"
         "movd %2, %%mm5 \n\t"
         "movd %3, %%mm6 \n\t"
         "pshufw $0, %%mm3, %%mm3 \n\t"
@@ -2071,11 +2071,11 @@
         "pxor %%mm7, %%mm7 \n\t"
         :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
     );
     for(y=0; y<h; y++){
         for(x=0; x<w; x+=4){
-            asm volatile(
+            __asm__ volatile(
                 "movd %0, %%mm0 \n\t"
                 "movd %1, %%mm1 \n\t"
                 "punpcklbw %%mm7, %%mm0 \n\t"
                 "punpcklbw %%mm7, %%mm1 \n\t"
                 "pmullw %%mm3, %%mm0 \n\t"