Mercurial > libavcodec.hg
comparison x86/vp8dsp.asm @ 12238:1a7903913e9b libavcodec
VP8: 30% faster idct_mb
Take shortcuts based on statistically common situations.
Add 4-at-a-time idct_dc function (mmx and sse2) since rows of 4 DC-only DCT
blocks are common.
TODO: tie this more directly into the MB mode, since the DC-level transform is
only used for non-splitmv blocks?
author | darkshikari |
---|---|
date | Fri, 23 Jul 2010 02:58:27 +0000 |
parents | e08d65897115 |
children | 13b1ad24a4b1 |
comparison
equal
deleted
inserted
replaced
12237:f0c4dc49c8f1 | 12238:1a7903913e9b |
---|---|
898 sub r4d, 2 | 898 sub r4d, 2 |
899 jg .nextrow | 899 jg .nextrow |
900 REP_RET | 900 REP_RET |
901 | 901 |
902 ;----------------------------------------------------------------------------- | 902 ;----------------------------------------------------------------------------- |
903 ; IDCT functions: | |
904 ; | |
905 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | 903 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
906 ;----------------------------------------------------------------------------- | 904 ;----------------------------------------------------------------------------- |
907 | 905 |
906 %macro ADD_DC 4 | |
907 %4 m2, [r0+%3] | |
908 %4 m3, [r0+r2+%3] | |
909 %4 m4, [r1+%3] | |
910 %4 m5, [r1+r2+%3] | |
911 paddusb m2, %1 | |
912 paddusb m3, %1 | |
913 paddusb m4, %1 | |
914 paddusb m5, %1 | |
915 psubusb m2, %2 | |
916 psubusb m3, %2 | |
917 psubusb m4, %2 | |
918 psubusb m5, %2 | |
919 %4 [r0+%3], m2 | |
920 %4 [r0+r2+%3], m3 | |
921 %4 [r1+%3], m4 | |
922 %4 [r1+r2+%3], m5 | |
923 %endmacro | |
924 | |
925 INIT_MMX | |
908 cglobal vp8_idct_dc_add_mmx, 3, 3 | 926 cglobal vp8_idct_dc_add_mmx, 3, 3 |
909 ; load data | 927 ; load data |
910 movd mm0, [r1] | 928 movd m0, [r1] |
911 | 929 |
912 ; calculate DC | 930 ; calculate DC |
913 paddw mm0, [pw_4] | 931 paddw m0, [pw_4] |
914 pxor mm1, mm1 | 932 pxor m1, m1 |
915 psraw mm0, 3 | 933 psraw m0, 3 |
916 movd [r1], mm1 | 934 movd [r1], m1 |
917 psubw mm1, mm0 | 935 psubw m1, m0 |
918 packuswb mm0, mm0 | 936 packuswb m0, m0 |
919 packuswb mm1, mm1 | 937 packuswb m1, m1 |
920 punpcklbw mm0, mm0 | 938 punpcklbw m0, m0 |
921 punpcklbw mm1, mm1 | 939 punpcklbw m1, m1 |
922 punpcklwd mm0, mm0 | 940 punpcklwd m0, m0 |
923 punpcklwd mm1, mm1 | 941 punpcklwd m1, m1 |
924 | 942 |
925 ; add DC | 943 ; add DC |
926 lea r1, [r0+r2*2] | 944 lea r1, [r0+r2*2] |
927 movd mm2, [r0] | 945 ADD_DC m0, m1, 0, movh |
928 movd mm3, [r0+r2] | |
929 movd mm4, [r1] | |
930 movd mm5, [r1+r2] | |
931 paddusb mm2, mm0 | |
932 paddusb mm3, mm0 | |
933 paddusb mm4, mm0 | |
934 paddusb mm5, mm0 | |
935 psubusb mm2, mm1 | |
936 psubusb mm3, mm1 | |
937 psubusb mm4, mm1 | |
938 psubusb mm5, mm1 | |
939 movd [r0], mm2 | |
940 movd [r0+r2], mm3 | |
941 movd [r1], mm4 | |
942 movd [r1+r2], mm5 | |
943 RET | 946 RET |
944 | 947 |
948 INIT_XMM | |
945 cglobal vp8_idct_dc_add_sse4, 3, 3, 6 | 949 cglobal vp8_idct_dc_add_sse4, 3, 3, 6 |
946 ; load data | 950 ; load data |
947 movd xmm0, [r1] | 951 movd m0, [r1] |
948 pxor xmm1, xmm1 | 952 pxor m1, m1 |
949 | 953 |
950 ; calculate DC | 954 ; calculate DC |
951 paddw xmm0, [pw_4] | 955 paddw m0, [pw_4] |
952 movd [r1], xmm1 | 956 movd [r1], m1 |
953 lea r1, [r0+r2*2] | 957 lea r1, [r0+r2*2] |
954 movd xmm2, [r0] | 958 movd m2, [r0] |
955 movd xmm3, [r0+r2] | 959 movd m3, [r0+r2] |
956 movd xmm4, [r1] | 960 movd m4, [r1] |
957 movd xmm5, [r1+r2] | 961 movd m5, [r1+r2] |
958 psraw xmm0, 3 | 962 psraw m0, 3 |
959 pshuflw xmm0, xmm0, 0 | 963 pshuflw m0, m0, 0 |
960 punpcklqdq xmm0, xmm0 | 964 punpcklqdq m0, m0 |
961 punpckldq xmm2, xmm3 | 965 punpckldq m2, m3 |
962 punpckldq xmm4, xmm5 | 966 punpckldq m4, m5 |
963 punpcklbw xmm2, xmm1 | 967 punpcklbw m2, m1 |
964 punpcklbw xmm4, xmm1 | 968 punpcklbw m4, m1 |
965 paddw xmm2, xmm0 | 969 paddw m2, m0 |
966 paddw xmm4, xmm0 | 970 paddw m4, m0 |
967 packuswb xmm2, xmm4 | 971 packuswb m2, m4 |
968 movd [r0], xmm2 | 972 movd [r0], m2 |
969 pextrd [r0+r2], xmm2, 1 | 973 pextrd [r0+r2], m2, 1 |
970 pextrd [r1], xmm2, 2 | 974 pextrd [r1], m2, 2 |
971 pextrd [r1+r2], xmm2, 3 | 975 pextrd [r1+r2], m2, 3 |
976 RET | |
977 | |
978 ;----------------------------------------------------------------------------- | |
979 ; void vp8_idct_dc_add4_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); | |
980 ;----------------------------------------------------------------------------- | |
981 | |
982 INIT_MMX | |
983 cglobal vp8_idct_dc_add4_mmx, 3, 3 | |
984 ; load data | |
985 movd m0, [r1+32*0] ; A | |
986 movd m1, [r1+32*2] ; C | |
987 punpcklwd m0, [r1+32*1] ; A B | |
988 punpcklwd m1, [r1+32*3] ; C D | |
989 punpckldq m0, m1 ; A B C D | |
990 pxor m6, m6 | |
991 | |
992 ; calculate DC | |
993 paddw m0, [pw_4] | |
994 movd [r1+32*0], m6 | |
995 movd [r1+32*1], m6 | |
996 movd [r1+32*2], m6 | |
997 movd [r1+32*3], m6 | |
998 psraw m0, 3 | |
999 psubw m6, m0 | |
1000 packuswb m0, m0 | |
1001 packuswb m6, m6 | |
1002 punpcklbw m0, m0 ; AABBCCDD | |
1003 punpcklbw m6, m6 ; AABBCCDD | |
1004 movq m1, m0 | |
1005 movq m7, m6 | |
1006 punpcklbw m0, m0 ; AAAABBBB | |
1007 punpckhbw m1, m1 ; CCCCDDDD | |
1008 punpcklbw m6, m6 ; AAAABBBB | |
1009 punpckhbw m7, m7 ; CCCCDDDD | |
1010 | |
1011 ; add DC | |
1012 lea r1, [r0+r2*2] | |
1013 ADD_DC m0, m6, 0, mova | |
1014 ADD_DC m1, m7, 8, mova | |
1015 RET | |
1016 | |
1017 INIT_XMM | |
1018 cglobal vp8_idct_dc_add4_sse2, 3, 3 | |
1019 ; load data | |
1020 movd m0, [r1+32*0] ; A | |
1021 movd m1, [r1+32*2] ; C | |
1022 punpcklwd m0, [r1+32*1] ; A B | |
1023 punpcklwd m1, [r1+32*3] ; C D | |
1024 punpckldq m0, m1 ; A B C D | |
1025 pxor m1, m1 | |
1026 | |
1027 ; calculate DC | |
1028 paddw m0, [pw_4] | |
1029 movd [r1+32*0], m1 | |
1030 movd [r1+32*1], m1 | |
1031 movd [r1+32*2], m1 | |
1032 movd [r1+32*3], m1 | |
1033 psraw m0, 3 | |
1034 psubw m1, m0 | |
1035 packuswb m0, m0 | |
1036 packuswb m1, m1 | |
1037 punpcklbw m0, m0 | |
1038 punpcklbw m1, m1 | |
1039 punpcklbw m0, m0 | |
1040 punpcklbw m1, m1 | |
1041 | |
1042 ; add DC | |
1043 lea r1, [r0+r2*2] | |
1044 ADD_DC m0, m1, 0, mova | |
972 RET | 1045 RET |
973 | 1046 |
974 ;----------------------------------------------------------------------------- | 1047 ;----------------------------------------------------------------------------- |
975 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | 1048 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
976 ;----------------------------------------------------------------------------- | 1049 ;----------------------------------------------------------------------------- |