comparison x86/h264dsp_mmx.c @ 11369:98970e51365a (libavcodec)

Remove DECLARE_ALIGNED_{8,16} macros

These macros are redundant. All uses are replaced with the generic
DECLARE_ALIGNED macro instead.

author      mru
date        Sat, 06 Mar 2010 14:24:59 +0000
parents     aa10bb3c244c
children    731050abce41
comparing   11368:3d4f64b8fb10 with 11369:98970e51365a
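For context, the generic macro that replaces the removed variants takes the alignment as its first argument, so each call site below simply moves the alignment value into the argument list. A minimal sketch of the idea, assuming the common GCC __attribute__((aligned)) form (the real definition also has branches for other compilers, and the wrappers shown are an assumption about how the removed macros were expressed):

    /* Sketch only -- not the exact libavutil/libavcodec definitions. */
    #define DECLARE_ALIGNED(n, t, v)  t __attribute__ ((aligned (n))) v

    /* The removed helpers were effectively fixed-alignment wrappers: */
    #define DECLARE_ALIGNED_8(t, v)   DECLARE_ALIGNED(8,  t, v)
    #define DECLARE_ALIGNED_16(t, v)  DECLARE_ALIGNED(16, t, v)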
@@ -18,12 +18,12 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
 #include "dsputil_mmx.h"
 
-DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
-DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;
+DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
+DECLARE_ALIGNED(8, static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;
 
 /***********************************/
 /* IDCT */
 
 #define SUMSUB_BADC( a, b, c, d ) \
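Under the GCC-style definition sketched above, the rewritten declaration of ff_pb_3_1 expands to an ordinary aligned constant; an 8-byte-aligned 8-byte object can never straddle a cache line, which is what the MMX code wants when it loads it with a single movq. Illustrative expansion only:

    /* Expansion under the assumed GCC definition (illustrative only): */
    static const uint64_t __attribute__ ((aligned (8))) ff_pb_3_1 = 0x0103010301030103ULL;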
@@ -155,16 +155,16 @@
 }
 
 static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
 {
     int i;
-    DECLARE_ALIGNED_8(int16_t, b2)[64];
+    DECLARE_ALIGNED(8, int16_t, b2)[64];
 
     block[0] += 32;
 
     for(i=0; i<2; i++){
-        DECLARE_ALIGNED_8(uint64_t, tmp);
+        DECLARE_ALIGNED(8, uint64_t, tmp);
 
         h264_idct8_1d(block+4*i);
 
         __asm__ volatile(
             "movq %%mm7, %0 \n\t"
626 "pminub "#tc0", "#q2" \n\t"\ 626 "pminub "#tc0", "#q2" \n\t"\
627 "movq "#q2", "q1addr" \n\t" 627 "movq "#q2", "q1addr" \n\t"
628 628
629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) 629 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
630 { 630 {
631 DECLARE_ALIGNED_8(uint64_t, tmp0)[2]; 631 DECLARE_ALIGNED(8, uint64_t, tmp0)[2];
632 632
633 __asm__ volatile( 633 __asm__ volatile(
634 "movq (%2,%4), %%mm0 \n\t" //p1 634 "movq (%2,%4), %%mm0 \n\t" //p1
635 "movq (%2,%4,2), %%mm1 \n\t" //p0 635 "movq (%2,%4,2), %%mm1 \n\t" //p0
636 "movq (%3), %%mm2 \n\t" //q0 636 "movq (%3), %%mm2 \n\t" //q0
@@ -688,11 +688,11 @@
 }
 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 {
     //FIXME: could cut some load/stores by merging transpose with filter
     // also, it only needs to transpose 6x8
-    DECLARE_ALIGNED_8(uint8_t, trans)[8*8];
+    DECLARE_ALIGNED(8, uint8_t, trans)[8*8];
     int i;
     for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
         if((tc0[0] & tc0[1]) < 0)
             continue;
         transpose4x4(trans, pix-4, 8, stride);
@@ -732,11 +732,11 @@
 }
 
 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
 {
     //FIXME: could cut some load/stores by merging transpose with filter
-    DECLARE_ALIGNED_8(uint8_t, trans)[8*4];
+    DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
     transpose4x4(trans, pix-2, 8, stride);
     transpose4x4(trans+4, pix-2+4*stride, 8, stride);
     h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
     transpose4x4(pix-2, trans, stride, 8);
     transpose4x4(pix-2+4*stride, trans+4, stride, 8);
@@ -782,11 +782,11 @@
 }
 
 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
 {
     //FIXME: could cut some load/stores by merging transpose with filter
-    DECLARE_ALIGNED_8(uint8_t, trans)[8*4];
+    DECLARE_ALIGNED(8, uint8_t, trans)[8*4];
     transpose4x4(trans, pix-2, 8, stride);
     transpose4x4(trans+4, pix-2+4*stride, 8, stride);
     h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
     transpose4x4(pix-2, trans, stride, 8);
     transpose4x4(pix-2+4*stride, trans+4, stride, 8);
@@ -813,11 +813,11 @@
     // could do a special case for dir==0 && edges==1, but it only reduces the
     // average filter time by 1.2%
     for( dir=1; dir>=0; dir-- ) {
         const x86_reg d_idx = dir ? -8 : -1;
         const int mask_mv = dir ? mask_mv1 : mask_mv0;
-        DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
+        DECLARE_ALIGNED(8, const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
         int b_idx, edge;
         for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
             __asm__ volatile(
                 "pand %0, %%mm0 \n\t"
                 ::"m"(mask_dir)
@@ -2104,11 +2104,11 @@
 H264_MC_816(H264_MC_H, ssse3)
 H264_MC_816(H264_MC_HV, ssse3)
 #endif
 
 /* rnd interleaved with rnd div 8, use p+1 to access rnd div 8 */
-DECLARE_ALIGNED_8(static const uint64_t, h264_rnd_reg)[4] = {
+DECLARE_ALIGNED(8, static const uint64_t, h264_rnd_reg)[4] = {
     0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL
 };
 
 #define H264_CHROMA_OP(S,D)
 #define H264_CHROMA_OP4(S,D,T)
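The comment above describes the layout of h264_rnd_reg: each packed rounding constant is immediately followed by the same constant divided by 8, so a pointer p into the table yields the full value at p[0] and the div-8 value at p[1]. A sketch of that access pattern (the selector argument below is hypothetical and not how the callers in this file actually pick the entry):

    #include <stdint.h>

    /* Same layout as h264_rnd_reg: { rnd, rnd/8, rnd', rnd'/8 } in 4x16-bit lanes. */
    static const uint64_t rnd_reg[4] = {
        0x0020002000200020ULL, 0x0004000400040004ULL,   /* 32 and 32/8 = 4      */
        0x001C001C001C001CULL, 0x0003000300030003ULL    /* 28 and 28/8 -> 3     */
    };

    /* use_second_pair is an illustrative selector only. */
    static const uint64_t *pick_rnd(int use_second_pair)
    {
        const uint64_t *p = rnd_reg + (use_second_pair ? 2 : 0);
        /* p[0] = packed rounding constant, p[1] = the same constant div 8 */
        return p;
    }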