comparison x86/vp8dsp.asm @ 12238:1a7903913e9b libavcodec

VP8: 30% faster idct_mb. Take shortcuts based on statistically common situations. Add a 4-at-a-time idct_dc function (mmx and sse2), since rows of 4 DC-only DCT blocks are common. TODO: tie this more directly into the MB mode, since the DC-level transform is only used for non-splitmv blocks?
author darkshikari
date Fri, 23 Jul 2010 02:58:27 +0000
parents e08d65897115
children 13b1ad24a4b1
comparison
equal deleted inserted replaced
12237:f0c4dc49c8f1 12238:1a7903913e9b
898 sub r4d, 2 898 sub r4d, 2
899 jg .nextrow 899 jg .nextrow
900 REP_RET 900 REP_RET
901 901
902 ;----------------------------------------------------------------------------- 902 ;-----------------------------------------------------------------------------
903 ; IDCT functions:
904 ;
905 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); 903 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
906 ;----------------------------------------------------------------------------- 904 ;-----------------------------------------------------------------------------
907 905
906 %macro ADD_DC 4 ; args: 1 = reg of bytes to add, 2 = reg of bytes to subtract, 3 = byte offset into each row, 4 = load/store mnemonic
907 %4 m2, [r0+%3] ; load four rows into m2-m5: r0, r0+r2, r1, r1+r2 (callers set r1 = r0+r2*2 first)
908 %4 m3, [r0+r2+%3]
909 %4 m4, [r1+%3]
910 %4 m5, [r1+r2+%3]
911 paddusb m2, %1 ; unsigned saturating byte add of the first argument to every row
912 paddusb m3, %1
913 paddusb m4, %1
914 paddusb m5, %1
915 psubusb m2, %2 ; unsigned saturating byte subtract of the second argument from every row
916 psubusb m3, %2
917 psubusb m4, %2
918 psubusb m5, %2
919 %4 [r0+%3], m2 ; write the four rows back to the addresses they were loaded from
920 %4 [r0+r2+%3], m3
921 %4 [r1+%3], m4
922 %4 [r1+r2+%3], m5
923 %endmacro
924
925 INIT_MMX
908 cglobal vp8_idct_dc_add_mmx, 3, 3 926 cglobal vp8_idct_dc_add_mmx, 3, 3
909 ; load data 927 ; load data
910 movd mm0, [r1] 928 movd m0, [r1] ; low 32 bits of the block at r1; DC is taken from the low word (assumes 16-bit DCTELEM - confirm)
911 929
912 ; calculate DC 930 ; calculate DC
913 paddw mm0, [pw_4] 931 paddw m0, [pw_4] ; rounding bias; pw_4 is defined outside this view (presumably words of 4 - confirm)
914 pxor mm1, mm1 932 pxor m1, m1 ; m1 = 0
915 psraw mm0, 3 933 psraw m0, 3 ; dc = (coeff + bias) >> 3, arithmetic shift
916 movd [r1], mm1 934 movd [r1], m1 ; clear the first 32 bits of the coefficient block
917 psubw mm1, mm0 935 psubw m1, m0 ; m1 = -dc
918 packuswb mm0, mm0 936 packuswb m0, m0 ; saturate dc to 0..255: non-zero only when dc > 0
919 packuswb mm1, mm1 937 packuswb m1, m1 ; saturate -dc to 0..255: non-zero only when dc < 0
920 punpcklbw mm0, mm0 938 punpcklbw m0, m0 ; duplicate each byte: low byte now fills the low word
921 punpcklbw mm1, mm1 939 punpcklbw m1, m1
922 punpcklwd mm0, mm0 940 punpcklwd m0, m0 ; duplicate each word: low byte now fills the low 4 bytes
923 punpcklwd mm1, mm1 941 punpcklwd m1, m1
924 942
925 ; add DC 943 ; add DC
926 lea r1, [r0+r2*2] 944 lea r1, [r0+r2*2] ; r1 = r0 + 2*r2, the third row (r0/r2 presumably dst/stride per the prototype)
927 movd mm2, [r0] 945 ADD_DC m0, m1, 0, movh
928 movd mm3, [r0+r2]
929 movd mm4, [r1]
930 movd mm5, [r1+r2]
931 paddusb mm2, mm0
932 paddusb mm3, mm0
933 paddusb mm4, mm0
934 paddusb mm5, mm0
935 psubusb mm2, mm1
936 psubusb mm3, mm1
937 psubusb mm4, mm1
938 psubusb mm5, mm1
939 movd [r0], mm2
940 movd [r0+r2], mm3
941 movd [r1], mm4
942 movd [r1+r2], mm5
943 RET 946 RET
944 947
948 INIT_XMM
945 cglobal vp8_idct_dc_add_sse4, 3, 3, 6 949 cglobal vp8_idct_dc_add_sse4, 3, 3, 6
946 ; load data 950 ; load data
947 movd xmm0, [r1] 951 movd m0, [r1] ; low 32 bits of the block at r1; DC is taken from the low word (assumes 16-bit DCTELEM - confirm)
948 pxor xmm1, xmm1 952 pxor m1, m1 ; m1 = 0: used to clear the block and as the zero source for byte->word unpacking
949 953
950 ; calculate DC 954 ; calculate DC
951 paddw xmm0, [pw_4] 955 paddw m0, [pw_4] ; rounding bias; pw_4 is defined outside this view (presumably words of 4 - confirm)
952 movd [r1], xmm1 956 movd [r1], m1 ; clear the first 32 bits of the coefficient block
953 lea r1, [r0+r2*2] 957 lea r1, [r0+r2*2] ; r1 = r0 + 2*r2, the third row
954 movd xmm2, [r0] 958 movd m2, [r0] ; load 4 bytes from each of the four rows
955 movd xmm3, [r0+r2] 959 movd m3, [r0+r2]
956 movd xmm4, [r1] 960 movd m4, [r1]
957 movd xmm5, [r1+r2] 961 movd m5, [r1+r2]
958 psraw xmm0, 3 962 psraw m0, 3 ; dc = (coeff + bias) >> 3, arithmetic shift
959 pshuflw xmm0, xmm0, 0 963 pshuflw m0, m0, 0 ; broadcast word 0 across the low 4 words
960 punpcklqdq xmm0, xmm0 964 punpcklqdq m0, m0 ; copy the low qword to the high qword: dc in all 8 words
961 punpckldq xmm2, xmm3 965 punpckldq m2, m3 ; m2 low qword = row 0 | row 1
962 punpckldq xmm4, xmm5 966 punpckldq m4, m5 ; m4 low qword = row 2 | row 3
963 punpcklbw xmm2, xmm1 967 punpcklbw m2, m1 ; zero-extend the 8 bytes to 8 words
964 punpcklbw xmm4, xmm1 968 punpcklbw m4, m1
965 paddw xmm2, xmm0 969 paddw m2, m0 ; add the signed dc at word precision
966 paddw xmm4, xmm0 970 paddw m4, m0
967 packuswb xmm2, xmm4 971 packuswb m2, m4 ; saturate to 0..255 and pack: m2 = row 0 | row 1 | row 2 | row 3
968 movd [r0], xmm2 972 movd [r0], m2 ; store row 0 (dword 0)
969 pextrd [r0+r2], xmm2, 1 973 pextrd [r0+r2], m2, 1 ; store row 1 (dword 1)
970 pextrd [r1], xmm2, 2 974 pextrd [r1], m2, 2 ; store row 2 (dword 2)
971 pextrd [r1+r2], xmm2, 3 975 pextrd [r1+r2], m2, 3 ; store row 3 (dword 3)
976 RET
977
978 ;-----------------------------------------------------------------------------
979 ; void vp8_idct_dc_add4_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
980 ;-----------------------------------------------------------------------------
981
982 INIT_MMX
983 cglobal vp8_idct_dc_add4_mmx, 3, 3
984 ; load data: the low dword of each of the four blocks, which start 32 bytes apart at r1
985 movd m0, [r1+32*0] ; A (word 0 = first coefficient of block 0)
986 movd m1, [r1+32*2] ; C (word 0 = first coefficient of block 2)
987 punpcklwd m0, [r1+32*1] ; A B (interleaved with the words of block 1)
988 punpcklwd m1, [r1+32*3] ; C D (interleaved with the words of block 3)
989 punpckldq m0, m1 ; A B C D: the four first coefficients as the four words of m0
990 pxor m6, m6 ; m6 = 0
991
992 ; calculate DC
993 paddw m0, [pw_4] ; rounding bias; pw_4 is defined outside this view (presumably words of 4 - confirm)
994 movd [r1+32*0], m6 ; clear the first 32 bits of each of the four blocks
995 movd [r1+32*1], m6
996 movd [r1+32*2], m6
997 movd [r1+32*3], m6
998 psraw m0, 3 ; dc = (coeff + bias) >> 3 per word, arithmetic shift
999 psubw m6, m0 ; m6 = -dc per word
1000 packuswb m0, m0 ; saturate dc to 0..255: non-zero only where dc > 0
1001 packuswb m6, m6 ; saturate -dc to 0..255: non-zero only where dc < 0
1002 punpcklbw m0, m0 ; AABBCCDD
1003 punpcklbw m6, m6 ; AABBCCDD (from the negated values)
1004 movq m1, m0 ; keep a copy so the high half (C, D) can be expanded separately
1005 movq m7, m6
1006 punpcklbw m0, m0 ; AAAABBBB
1007 punpckhbw m1, m1 ; CCCCDDDD
1008 punpcklbw m6, m6 ; AAAABBBB
1009 punpckhbw m7, m7 ; CCCCDDDD
1010
1011 ; add DC: offset 0 applies A/B to bytes 0-7 of each row, offset 8 applies C/D to bytes 8-15
1012 lea r1, [r0+r2*2] ; r1 = r0 + 2*r2, the third row
1013 ADD_DC m0, m6, 0, mova
1014 ADD_DC m1, m7, 8, mova
1015 RET
1016
1017 INIT_XMM
1018 cglobal vp8_idct_dc_add4_sse2, 3, 3
1019 ; load data: the first coefficient of each of the four blocks, which start 32 bytes apart at r1
1020 movd m0, [r1+32*0] ; A (word 0 = first coefficient of block 0)
1021 movd m1, [r1+32*2] ; C (word 0 = first coefficient of block 2)
1022 punpcklwd m0, [r1+32*1] ; A B (full-register memory operand on block 1; legacy SSE requires 16-byte alignment here)
1023 punpcklwd m1, [r1+32*3] ; C D (same, on block 3)
1024 punpckldq m0, m1 ; A B C D in the low 4 words of m0
1025 pxor m1, m1 ; m1 = 0
1026
1027 ; calculate DC
1028 paddw m0, [pw_4] ; rounding bias; pw_4 is defined outside this view (presumably words of 4 - confirm)
1029 movd [r1+32*0], m1 ; clear the first 32 bits of each of the four blocks
1030 movd [r1+32*1], m1
1031 movd [r1+32*2], m1
1032 movd [r1+32*3], m1
1033 psraw m0, 3 ; dc = (coeff + bias) >> 3 per word, arithmetic shift
1034 psubw m1, m0 ; m1 = -dc per word
1035 packuswb m0, m0 ; saturate dc to 0..255: non-zero only where dc > 0
1036 packuswb m1, m1 ; saturate -dc to 0..255: non-zero only where dc < 0
1037 punpcklbw m0, m0 ; AABBCCDD in the low 8 bytes
1038 punpcklbw m1, m1
1039 punpcklbw m0, m0 ; AAAABBBBCCCCDDDD across all 16 bytes
1040 punpcklbw m1, m1
1041
1042 ; add DC to 16 bytes of each of the four rows (ADD_DC uses m2-m5 as scratch, so m0-m5 are touched in total)
1043 lea r1, [r0+r2*2] ; r1 = r0 + 2*r2, the third row
1044 ADD_DC m0, m1, 0, mova
972 RET 1045 RET
973 1046
974 ;----------------------------------------------------------------------------- 1047 ;-----------------------------------------------------------------------------
975 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); 1048 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
976 ;----------------------------------------------------------------------------- 1049 ;-----------------------------------------------------------------------------