comparison x86/h264dsp_mmx.c @ 12451:4c3e6ff1237e libavcodec

Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square biweight code for sse2/ssse3; add sse2 weight code; and use that same code to generate the mmx2 functions as well, so that the inline asm in h264dsp_mmx.c can be removed. OK'ed by Jason on IRC.
author rbultje
date Wed, 01 Sep 2010 20:56:16 +0000
parents 3941687b4fa9
children f4355cd85faa
comparison
equal deleted inserted replaced
12450:3941687b4fa9 12451:4c3e6ff1237e
919 } 919 }
920 920
921 /***********************************/ 921 /***********************************/
922 /* weighted prediction */ 922 /* weighted prediction */
923 923
924 static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h) 924 #define H264_WEIGHT(W, H, OPT) \
925 { 925 void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
926 int x, y; 926 int stride, int log2_denom, int weight, int offset);
927 offset <<= log2_denom; 927
928 offset += (1 << log2_denom) >> 1; 928 #define H264_BIWEIGHT(W, H, OPT) \
929 __asm__ volatile( 929 void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
930 "movd %0, %%mm4 \n\t" 930 uint8_t *src, int stride, int log2_denom, int weightd, \
931 "movd %1, %%mm5 \n\t" 931 int weights, int offset);
932 "movd %2, %%mm6 \n\t" 932
933 "pshufw $0, %%mm4, %%mm4 \n\t" 933 #define H264_BIWEIGHT_MMX(W,H) \
934 "pshufw $0, %%mm5, %%mm5 \n\t" 934 H264_WEIGHT (W, H, mmx2) \
935 "pxor %%mm7, %%mm7 \n\t" 935 H264_BIWEIGHT(W, H, mmx2)
936 :: "g"(weight), "g"(offset), "g"(log2_denom) 936
937 ); 937 #define H264_BIWEIGHT_MMX_SSE(W,H) \
938 for(y=0; y<h; y+=2){ 938 H264_BIWEIGHT_MMX(W, H) \
939 for(x=0; x<w; x+=4){ 939 H264_WEIGHT (W, H, sse2) \
940 __asm__ volatile( 940 H264_BIWEIGHT (W, H, sse2) \
941 "movd %0, %%mm0 \n\t" 941 H264_BIWEIGHT (W, H, ssse3)
942 "movd %1, %%mm1 \n\t" 942
943 "punpcklbw %%mm7, %%mm0 \n\t" 943 H264_BIWEIGHT_MMX_SSE(16, 16)
944 "punpcklbw %%mm7, %%mm1 \n\t" 944 H264_BIWEIGHT_MMX_SSE(16, 8)
945 "pmullw %%mm4, %%mm0 \n\t" 945 H264_BIWEIGHT_MMX_SSE( 8, 16)
946 "pmullw %%mm4, %%mm1 \n\t" 946 H264_BIWEIGHT_MMX_SSE( 8, 8)
947 "paddsw %%mm5, %%mm0 \n\t" 947 H264_BIWEIGHT_MMX_SSE( 8, 4)
948 "paddsw %%mm5, %%mm1 \n\t" 948 H264_BIWEIGHT_MMX ( 4, 8)
949 "psraw %%mm6, %%mm0 \n\t" 949 H264_BIWEIGHT_MMX ( 4, 4)
950 "psraw %%mm6, %%mm1 \n\t" 950 H264_BIWEIGHT_MMX ( 4, 2)
951 "packuswb %%mm7, %%mm0 \n\t"
952 "packuswb %%mm7, %%mm1 \n\t"
953 "movd %%mm0, %0 \n\t"
954 "movd %%mm1, %1 \n\t"
955 : "+m"(*(uint32_t*)(dst+x)),
956 "+m"(*(uint32_t*)(dst+x+stride))
957 );
958 }
959 dst += 2*stride;
960 }
961 }
962
963 static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
964 {
965 int x, y;
966 offset = ((offset + 1) | 1) << log2_denom;
967 __asm__ volatile(
968 "movd %0, %%mm3 \n\t"
969 "movd %1, %%mm4 \n\t"
970 "movd %2, %%mm5 \n\t"
971 "movd %3, %%mm6 \n\t"
972 "pshufw $0, %%mm3, %%mm3 \n\t"
973 "pshufw $0, %%mm4, %%mm4 \n\t"
974 "pshufw $0, %%mm5, %%mm5 \n\t"
975 "pxor %%mm7, %%mm7 \n\t"
976 :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
977 );
978 for(y=0; y<h; y++){
979 for(x=0; x<w; x+=4){
980 __asm__ volatile(
981 "movd %0, %%mm0 \n\t"
982 "movd %1, %%mm1 \n\t"
983 "punpcklbw %%mm7, %%mm0 \n\t"
984 "punpcklbw %%mm7, %%mm1 \n\t"
985 "pmullw %%mm3, %%mm0 \n\t"
986 "pmullw %%mm4, %%mm1 \n\t"
987 "paddsw %%mm1, %%mm0 \n\t"
988 "paddsw %%mm5, %%mm0 \n\t"
989 "psraw %%mm6, %%mm0 \n\t"
990 "packuswb %%mm0, %%mm0 \n\t"
991 "movd %%mm0, %0 \n\t"
992 : "+m"(*(uint32_t*)(dst+x))
993 : "m"(*(uint32_t*)(src+x))
994 );
995 }
996 src += stride;
997 dst += stride;
998 }
999 }
1000
1001 #define H264_WEIGHT(W,H) \
1002 static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
1003 ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
1004 } \
1005 static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
1006 ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
1007 }
1008
1009 H264_WEIGHT(16,16)
1010 H264_WEIGHT(16, 8)
1011 H264_WEIGHT( 8,16)
1012 H264_WEIGHT( 8, 8)
1013 H264_WEIGHT( 8, 4)
1014 H264_WEIGHT( 4, 8)
1015 H264_WEIGHT( 4, 4)
1016 H264_WEIGHT( 4, 2)
1017
1018 void ff_h264_biweight_8x8_sse2(uint8_t *dst, uint8_t *src, int stride,
1019 int log2_denom, int weightd, int weights,
1020 int offset);
1021
1022 void ff_h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
1023 int log2_denom, int weightd, int weights,
1024 int offset);
1025
1026 void ff_h264_biweight_8x8_ssse3(uint8_t *dst, uint8_t *src, int stride,
1027 int log2_denom, int weightd, int weights,
1028 int offset);
1029
1030 void ff_h264_biweight_16x16_ssse3(uint8_t *dst, uint8_t *src, int stride,
1031 int log2_denom, int weightd, int weights,
1032 int offset);
1033 951
1034 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); 952 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
1035 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); 953 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
1036 void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); 954 void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta);
1037 void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); 955 void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta);
1074 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; 992 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
1075 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; 993 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
1076 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; 994 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
1077 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; 995 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
1078 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; 996 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
1079 997 }
998 if(mm_flags & FF_MM_SSE2){
999 c->h264_idct8_add = ff_h264_idct8_add_sse2;
1000 c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
1001 }
1002
1003 #if HAVE_YASM
1004 if (mm_flags & FF_MM_MMX2){
1005 #if ARCH_X86_32
1006 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
1007 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
1008 #endif
1080 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; 1009 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
1081 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; 1010 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
1082 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; 1011 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
1083 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; 1012 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
1084 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; 1013 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
1092 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; 1021 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
1093 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; 1022 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
1094 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; 1023 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
1095 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; 1024 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
1096 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; 1025 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
1097 } 1026
1098 if(mm_flags & FF_MM_SSE2){
1099 c->h264_idct8_add = ff_h264_idct8_add_sse2;
1100 c->h264_idct8_add4= ff_h264_idct8_add4_sse2;
1101 }
1102
1103 #if HAVE_YASM
1104 if (mm_flags & FF_MM_MMX2){
1105 #if ARCH_X86_32
1106 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
1107 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
1108 #endif
1109 if( mm_flags&FF_MM_SSE2 ){ 1027 if( mm_flags&FF_MM_SSE2 ){
1028 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
1029 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
1030 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
1031 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
1032 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
1033
1110 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; 1034 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
1035 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
1036 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
1111 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2; 1037 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
1038 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
1039
1112 #if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110 1040 #if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110
1113 c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; 1041 c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
1114 c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; 1042 c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
1115 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; 1043 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
1116 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; 1044 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
1121 c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2; 1049 c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;
1122 #endif 1050 #endif
1123 } 1051 }
1124 if ( mm_flags&FF_MM_SSSE3 ){ 1052 if ( mm_flags&FF_MM_SSSE3 ){
1125 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; 1053 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
1054 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
1055 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
1126 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; 1056 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
1057 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
1127 } 1058 }
1128 } 1059 }
1129 #endif 1060 #endif
1130 } 1061 }
1131 } 1062 }