Mercurial > libavcodec.hg
comparison x86/h264dsp_mmx.c @ 12451:4c3e6ff1237e libavcodec
Rename h264_weight_sse2.asm to h264_weight.asm; add 16x8/8x16/8x4 non-square
biweight code to sse2/ssse3; add sse2 weight code; and use that same code to
create mmx2 functions also, so that the inline asm in h264dsp_mmx.c can be
removed. OK'ed by Jason on IRC.
author | rbultje |
---|---|
date | Wed, 01 Sep 2010 20:56:16 +0000 |
parents | 3941687b4fa9 |
children | f4355cd85faa |
Comparison legend: equal, deleted, inserted, replaced
12450:3941687b4fa9 | 12451:4c3e6ff1237e |
---|---|
919 } | 919 } |
920 | 920 |
921 /***********************************/ | 921 /***********************************/ |
922 /* weighted prediction */ | 922 /* weighted prediction */ |
923 | 923 |
924 static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h) | 924 #define H264_WEIGHT(W, H, OPT) \ |
925 { | 925 void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ |
926 int x, y; | 926 int stride, int log2_denom, int weight, int offset); |
927 offset <<= log2_denom; | 927 |
928 offset += (1 << log2_denom) >> 1; | 928 #define H264_BIWEIGHT(W, H, OPT) \ |
929 __asm__ volatile( | 929 void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \ |
930 "movd %0, %%mm4 \n\t" | 930 uint8_t *src, int stride, int log2_denom, int weightd, \ |
931 "movd %1, %%mm5 \n\t" | 931 int weights, int offset); |
932 "movd %2, %%mm6 \n\t" | 932 |
933 "pshufw $0, %%mm4, %%mm4 \n\t" | 933 #define H264_BIWEIGHT_MMX(W,H) \ |
934 "pshufw $0, %%mm5, %%mm5 \n\t" | 934 H264_WEIGHT (W, H, mmx2) \ |
935 "pxor %%mm7, %%mm7 \n\t" | 935 H264_BIWEIGHT(W, H, mmx2) |
936 :: "g"(weight), "g"(offset), "g"(log2_denom) | 936 |
937 ); | 937 #define H264_BIWEIGHT_MMX_SSE(W,H) \ |
938 for(y=0; y<h; y+=2){ | 938 H264_BIWEIGHT_MMX(W, H) \ |
939 for(x=0; x<w; x+=4){ | 939 H264_WEIGHT (W, H, sse2) \ |
940 __asm__ volatile( | 940 H264_BIWEIGHT (W, H, sse2) \ |
941 "movd %0, %%mm0 \n\t" | 941 H264_BIWEIGHT (W, H, ssse3) |
942 "movd %1, %%mm1 \n\t" | 942 |
943 "punpcklbw %%mm7, %%mm0 \n\t" | 943 H264_BIWEIGHT_MMX_SSE(16, 16) |
944 "punpcklbw %%mm7, %%mm1 \n\t" | 944 H264_BIWEIGHT_MMX_SSE(16, 8) |
945 "pmullw %%mm4, %%mm0 \n\t" | 945 H264_BIWEIGHT_MMX_SSE( 8, 16) |
946 "pmullw %%mm4, %%mm1 \n\t" | 946 H264_BIWEIGHT_MMX_SSE( 8, 8) |
947 "paddsw %%mm5, %%mm0 \n\t" | 947 H264_BIWEIGHT_MMX_SSE( 8, 4) |
948 "paddsw %%mm5, %%mm1 \n\t" | 948 H264_BIWEIGHT_MMX ( 4, 8) |
949 "psraw %%mm6, %%mm0 \n\t" | 949 H264_BIWEIGHT_MMX ( 4, 4) |
950 "psraw %%mm6, %%mm1 \n\t" | 950 H264_BIWEIGHT_MMX ( 4, 2) |
951 "packuswb %%mm7, %%mm0 \n\t" | |
952 "packuswb %%mm7, %%mm1 \n\t" | |
953 "movd %%mm0, %0 \n\t" | |
954 "movd %%mm1, %1 \n\t" | |
955 : "+m"(*(uint32_t*)(dst+x)), | |
956 "+m"(*(uint32_t*)(dst+x+stride)) | |
957 ); | |
958 } | |
959 dst += 2*stride; | |
960 } | |
961 } | |
962 | |
963 static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h) | |
964 { | |
965 int x, y; | |
966 offset = ((offset + 1) | 1) << log2_denom; | |
967 __asm__ volatile( | |
968 "movd %0, %%mm3 \n\t" | |
969 "movd %1, %%mm4 \n\t" | |
970 "movd %2, %%mm5 \n\t" | |
971 "movd %3, %%mm6 \n\t" | |
972 "pshufw $0, %%mm3, %%mm3 \n\t" | |
973 "pshufw $0, %%mm4, %%mm4 \n\t" | |
974 "pshufw $0, %%mm5, %%mm5 \n\t" | |
975 "pxor %%mm7, %%mm7 \n\t" | |
976 :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1) | |
977 ); | |
978 for(y=0; y<h; y++){ | |
979 for(x=0; x<w; x+=4){ | |
980 __asm__ volatile( | |
981 "movd %0, %%mm0 \n\t" | |
982 "movd %1, %%mm1 \n\t" | |
983 "punpcklbw %%mm7, %%mm0 \n\t" | |
984 "punpcklbw %%mm7, %%mm1 \n\t" | |
985 "pmullw %%mm3, %%mm0 \n\t" | |
986 "pmullw %%mm4, %%mm1 \n\t" | |
987 "paddsw %%mm1, %%mm0 \n\t" | |
988 "paddsw %%mm5, %%mm0 \n\t" | |
989 "psraw %%mm6, %%mm0 \n\t" | |
990 "packuswb %%mm0, %%mm0 \n\t" | |
991 "movd %%mm0, %0 \n\t" | |
992 : "+m"(*(uint32_t*)(dst+x)) | |
993 : "m"(*(uint32_t*)(src+x)) | |
994 ); | |
995 } | |
996 src += stride; | |
997 dst += stride; | |
998 } | |
999 } | |
1000 | |
1001 #define H264_WEIGHT(W,H) \ | |
1002 static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \ | |
1003 ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \ | |
1004 } \ | |
1005 static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \ | |
1006 ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \ | |
1007 } | |
1008 | |
1009 H264_WEIGHT(16,16) | |
1010 H264_WEIGHT(16, 8) | |
1011 H264_WEIGHT( 8,16) | |
1012 H264_WEIGHT( 8, 8) | |
1013 H264_WEIGHT( 8, 4) | |
1014 H264_WEIGHT( 4, 8) | |
1015 H264_WEIGHT( 4, 4) | |
1016 H264_WEIGHT( 4, 2) | |
1017 | |
1018 void ff_h264_biweight_8x8_sse2(uint8_t *dst, uint8_t *src, int stride, | |
1019 int log2_denom, int weightd, int weights, | |
1020 int offset); | |
1021 | |
1022 void ff_h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride, | |
1023 int log2_denom, int weightd, int weights, | |
1024 int offset); | |
1025 | |
1026 void ff_h264_biweight_8x8_ssse3(uint8_t *dst, uint8_t *src, int stride, | |
1027 int log2_denom, int weightd, int weights, | |
1028 int offset); | |
1029 | |
1030 void ff_h264_biweight_16x16_ssse3(uint8_t *dst, uint8_t *src, int stride, | |
1031 int log2_denom, int weightd, int weights, | |
1032 int offset); | |
1033 | 951 |
1034 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); | 952 void ff_x264_deblock_v_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); |
1035 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); | 953 void ff_x264_deblock_h_luma_sse2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0); |
1036 void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); | 954 void ff_x264_deblock_h_luma_intra_mmxext(uint8_t *pix, int stride, int alpha, int beta); |
1037 void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); | 955 void ff_x264_deblock_v_luma_intra_sse2(uint8_t *pix, int stride, int alpha, int beta); |
1074 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; | 992 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; |
1075 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; | 993 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; |
1076 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; | 994 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; |
1077 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; | 995 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; |
1078 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; | 996 c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; |
1079 | 997 } |
998 if(mm_flags & FF_MM_SSE2){ | |
999 c->h264_idct8_add = ff_h264_idct8_add_sse2; | |
1000 c->h264_idct8_add4= ff_h264_idct8_add4_sse2; | |
1001 } | |
1002 | |
1003 #if HAVE_YASM | |
1004 if (mm_flags & FF_MM_MMX2){ | |
1005 #if ARCH_X86_32 | |
1006 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; | |
1007 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; | |
1008 #endif | |
1080 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; | 1009 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; |
1081 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; | 1010 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; |
1082 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; | 1011 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; |
1083 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; | 1012 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; |
1084 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; | 1013 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; |
1092 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; | 1021 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; |
1093 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; | 1022 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; |
1094 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; | 1023 c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; |
1095 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; | 1024 c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; |
1096 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; | 1025 c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; |
1097 } | 1026 |
1098 if(mm_flags & FF_MM_SSE2){ | |
1099 c->h264_idct8_add = ff_h264_idct8_add_sse2; | |
1100 c->h264_idct8_add4= ff_h264_idct8_add4_sse2; | |
1101 } | |
1102 | |
1103 #if HAVE_YASM | |
1104 if (mm_flags & FF_MM_MMX2){ | |
1105 #if ARCH_X86_32 | |
1106 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext; | |
1107 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext; | |
1108 #endif | |
1109 if( mm_flags&FF_MM_SSE2 ){ | 1027 if( mm_flags&FF_MM_SSE2 ){ |
1028 c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2; | |
1029 c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2; | |
1030 c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2; | |
1031 c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2; | |
1032 c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2; | |
1033 | |
1110 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; | 1034 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2; |
1035 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2; | |
1036 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2; | |
1111 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2; | 1037 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2; |
1038 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2; | |
1039 | |
1112 #if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110 | 1040 #if ARCH_X86_64 || !defined(__ICC) || __ICC > 1110 |
1113 c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; | 1041 c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2; |
1114 c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; | 1042 c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2; |
1115 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; | 1043 c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2; |
1116 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; | 1044 c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2; |
1121 c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2; | 1049 c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2; |
1122 #endif | 1050 #endif |
1123 } | 1051 } |
1124 if ( mm_flags&FF_MM_SSSE3 ){ | 1052 if ( mm_flags&FF_MM_SSSE3 ){ |
1125 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; | 1053 c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3; |
1054 c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3; | |
1055 c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3; | |
1126 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; | 1056 c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3; |
1057 c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3; | |
1127 } | 1058 } |
1128 } | 1059 } |
1129 #endif | 1060 #endif |
1130 } | 1061 } |
1131 } | 1062 } |